Source code for etna.analysis.feature_relevance.relevance_table

import warnings
from typing import List
from typing import Union

import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import ExtraTreeRegressor

from etna.libs.tsfresh import calculate_relevance_table

TreeBasedRegressor = Union[
    DecisionTreeRegressor,
    ExtraTreeRegressor,
    RandomForestRegressor,
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    CatBoostRegressor,
]


[docs]def _prepare_df(df: pd.DataFrame, df_exog: pd.DataFrame, segment: str, regressors: List[str]): """Drop nan values from dataframes for the segment.""" first_valid_idx = df.loc[:, segment].first_valid_index() df_exog_seg = df_exog.loc[first_valid_idx:, segment].dropna()[regressors] df_seg = df.loc[first_valid_idx:, segment].dropna()["target"] common_index = df_seg.index.intersection(df_exog_seg.index) if len(common_index) < len(df.loc[first_valid_idx:, segment]): warnings.warn("Exogenous or target data contains None! It will be dropped for calculating relevance.") return df_seg.loc[common_index], df_exog_seg.loc[common_index]
[docs]def get_statistics_relevance_table(df: pd.DataFrame, df_exog: pd.DataFrame) -> pd.DataFrame: """Calculate relevance table with p-values from tsfresh. Parameters ---------- df: dataframe with timeseries df_exog: dataframe with exogenous data Returns ------- pd.DataFrame dataframe with p-values. Notes ----- Time complexity of this method is :math:`O(n\_segments * n\_features * history\_len)` """ regressors = sorted(df_exog.columns.get_level_values("feature").unique()) segments = sorted(df.columns.get_level_values("segment").unique()) result = np.empty((len(segments), len(regressors))) for k, seg in enumerate(segments): df_seg, df_exog_seg = _prepare_df(df=df, df_exog=df_exog, segment=seg, regressors=regressors) cat_cols = df_exog_seg.dtypes[df_exog_seg.dtypes == "category"].index for cat_col in cat_cols: try: df_exog_seg[cat_col] = df_exog_seg[cat_col].astype(float) except ValueError: raise ValueError(f"{cat_col} column cannot be cast to float type! Please, use encoders.") warnings.warn( "Exogenous data contains columns with category type! It will be converted to float. If this is not desired behavior, use encoders." ) relevance = calculate_relevance_table(X=df_exog_seg, y=df_seg, ml_task="regression")[ ["feature", "p_value"] ].values result[k] = np.array(sorted(relevance, key=lambda x: x[0]))[:, 1] relevance_table = pd.DataFrame(result) relevance_table.index = segments relevance_table.columns = regressors return relevance_table
[docs]def get_model_relevance_table(df: pd.DataFrame, df_exog: pd.DataFrame, model: TreeBasedRegressor) -> pd.DataFrame: """Calculate relevance table with feature importance from model. Parameters ---------- df: dataframe with timeseries df_exog: dataframe with exogenous data model: model to obtain feature importance, should have ``feature_importances_`` property Returns ------- pd.DataFrame dataframe with feature importance values. """ regressors = sorted(df_exog.columns.get_level_values("feature").unique()) segments = sorted(df.columns.get_level_values("segment").unique()) result = np.empty((len(segments), len(regressors))) for k, seg in enumerate(segments): df_seg, df_exog_seg = _prepare_df(df=df, df_exog=df_exog, segment=seg, regressors=regressors) model.fit(X=df_exog_seg, y=df_seg) result[k] = model.feature_importances_ relevance_table = pd.DataFrame(result) relevance_table.index = segments relevance_table.columns = regressors return relevance_table