|
|
@@ -0,0 +1,533 @@
|
|
|
+# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
|
|
|
+#
|
|
|
+# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
+# you may not use this file except in compliance with the License.
|
|
|
+# You may obtain a copy of the License at
|
|
|
+#
|
|
|
+# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
+#
|
|
|
+# Unless required by applicable law or agreed to in writing, software
|
|
|
+# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
+# See the License for the specific language governing permissions and
|
|
|
+# limitations under the License.
|
|
|
+
|
|
|
+
|
|
|
+from typing import Any, Callable, List, Optional, Sequence, Tuple, Union, Dict
|
|
|
+import os
|
|
|
+import numpy as np
|
|
|
+import pandas as pd
|
|
|
+import chinese_calendar
|
|
|
+from pandas.tseries.offsets import DateOffset, Easter, Day
|
|
|
+from pandas.tseries import holiday as hd
|
|
|
+from sklearn.preprocessing import StandardScaler
|
|
|
+
|
|
|
+
|
|
|
# Maximum search half-window, in days, used when locating the nearest holiday
# around a timestamp (about half a year plus a safety margin; see
# _distance_to_holiday).
MAX_WINDOW = 183 + 17

# --- Holiday rules (pandas.tseries.holiday) used for distance features ---
EasterSunday = hd.Holiday("Easter Sunday", month=1, day=1, offset=[Easter(), Day(0)])
NewYearsDay = hd.Holiday("New Years Day", month=1, day=1)
# First Sunday of February.
SuperBowl = hd.Holiday("Superbowl", month=2, day=1, offset=DateOffset(weekday=hd.SU(1)))
# Second Sunday of May.
MothersDay = hd.Holiday(
    "Mothers Day", month=5, day=1, offset=DateOffset(weekday=hd.SU(2))
)
IndependenceDay = hd.Holiday("Independence Day", month=7, day=4)
# NOTE(review): ChristmasEve and ChristmasDay share the rule name "Christmas" —
# presumably harmless since only dates are consumed, but confirm no caller
# relies on unique rule names.
ChristmasEve = hd.Holiday("Christmas", month=12, day=24)
ChristmasDay = hd.Holiday("Christmas", month=12, day=25)
NewYearsEve = hd.Holiday("New Years Eve", month=12, day=31)
# Day after the fourth Thursday of November (US Thanksgiving + 1).
BlackFriday = hd.Holiday(
    "Black Friday",
    month=11,
    day=1,
    offset=[pd.DateOffset(weekday=hd.TH(4)), Day(1)],
)
# Monday after the fourth Thursday of November (US Thanksgiving + 4).
CyberMonday = hd.Holiday(
    "Cyber Monday",
    month=11,
    day=1,
    offset=[pd.DateOffset(weekday=hd.TH(4)), Day(4)],
)

# All holiday rules expanded into "holidays_<i>" distance columns by
# time_feature(); order defines the column suffix index.
HOLIDAYS = [
    hd.EasterMonday,
    hd.GoodFriday,
    hd.USColumbusDay,
    hd.USLaborDay,
    hd.USMartinLutherKingJr,
    hd.USMemorialDay,
    hd.USPresidentsDay,
    hd.USThanksgivingDay,
    EasterSunday,
    NewYearsDay,
    SuperBowl,
    MothersDay,
    IndependenceDay,
    ChristmasEve,
    ChristmasDay,
    NewYearsEve,
    BlackFriday,
    CyberMonday,
]
|
|
|
+
|
|
|
+
|
|
|
def _cal_year(x: np.datetime64):
    """Return the calendar year of the given timestamp-like value."""
    return x.year
|
|
|
+
|
|
|
+
|
|
|
def _cal_month(x: np.datetime64):
    """Return the month (1-12) of the given timestamp-like value."""
    return x.month
|
|
|
+
|
|
|
+
|
|
|
def _cal_day(x: np.datetime64):
    """Return the day of month (1-31) of the given timestamp-like value."""
    return x.day
|
|
|
+
|
|
|
+
|
|
|
def _cal_hour(x: np.datetime64):
    """Return the hour (0-23) of the given timestamp-like value."""
    return x.hour
|
|
|
+
|
|
|
+
|
|
|
def _cal_weekday(x: np.datetime64):
    """Return the day of week (Monday=0 .. Sunday=6) of the timestamp."""
    return x.dayofweek
|
|
|
+
|
|
|
+
|
|
|
def _cal_quarter(x: np.datetime64):
    """Return the quarter (1-4) of the given timestamp-like value."""
    return x.quarter
|
|
|
+
|
|
|
+
|
|
|
def _cal_hourofday(x: np.datetime64):
    """Return the hour of day scaled to roughly [-0.5, 0.5]."""
    return x.hour / 23.0 - 0.5
|
|
|
+
|
|
|
+
|
|
|
def _cal_dayofweek(x: np.datetime64):
    """Return the day of week scaled to roughly [-0.5, 0.5] (Monday -> -0.5)."""
    return x.dayofweek / 6.0 - 0.5
|
|
|
+
|
|
|
+
|
|
|
def _cal_dayofmonth(x: np.datetime64):
    """Return the day of month scaled to roughly [-0.5, 0.5]."""
    return x.day / 30.0 - 0.5
|
|
|
+
|
|
|
+
|
|
|
def _cal_dayofyear(x: np.datetime64):
    """Return the day of year scaled to roughly [-0.5, 0.5]."""
    return x.dayofyear / 364.0 - 0.5
|
|
|
+
|
|
|
+
|
|
|
def _cal_weekofyear(x: np.datetime64):
    """Return the ISO week of year scaled to roughly [-0.5, 0.5].

    Args:
        x (np.datetime64): A timestamp-like value (pd.Timestamp in practice).

    Returns:
        float: ISO week number (1-53) mapped by ``week / 51 - 0.5``.
    """
    # `Timestamp.weekofyear` was deprecated in pandas 1.4 and removed in 2.0;
    # `isocalendar().week` returns the same ISO 8601 week number.
    return x.isocalendar().week / 51.0 - 0.5
|
|
|
+
|
|
|
+
|
|
|
def _cal_holiday(x: np.datetime64):
    """Return 1.0 if the date is a Chinese public holiday, else 0.0."""
    return float(chinese_calendar.is_holiday(x))
|
|
|
+
|
|
|
+
|
|
|
def _cal_workday(x: np.datetime64):
    """Return 1.0 if the date is a Chinese working day, else 0.0."""
    return float(chinese_calendar.is_workday(x))
|
|
|
+
|
|
|
+
|
|
|
def _cal_minuteofhour(x: np.datetime64):
    """Return the minute of hour scaled to roughly [-0.5, 0.5]."""
    return x.minute / 59 - 0.5
|
|
|
+
|
|
|
+
|
|
|
def _cal_monthofyear(x: np.datetime64):
    """Return the month of year scaled to roughly [-0.5, 0.5]."""
    return x.month / 11.0 - 0.5
|
|
|
+
|
|
|
+
|
|
|
# Dispatch table mapping a time-feature name (as accepted by the
# `feature_cols` argument of time_feature()) to the function that computes it
# from a single timestamp. Raw-valued features come first; the *-scaled
# entries map into roughly [-0.5, 0.5]; is_holiday / is_workday use the
# Chinese calendar.
CAL_DATE_METHOD = {
    "year": _cal_year,
    "month": _cal_month,
    "day": _cal_day,
    "hour": _cal_hour,
    "weekday": _cal_weekday,
    "quarter": _cal_quarter,
    "minuteofhour": _cal_minuteofhour,
    "monthofyear": _cal_monthofyear,
    "hourofday": _cal_hourofday,
    "dayofweek": _cal_dayofweek,
    "dayofmonth": _cal_dayofmonth,
    "dayofyear": _cal_dayofyear,
    "weekofyear": _cal_weekofyear,
    "is_holiday": _cal_holiday,
    "is_workday": _cal_workday,
}
|
|
|
+
|
|
|
+
|
|
|
def load_from_one_dataframe(
    data: Union[pd.DataFrame, pd.Series],
    time_col: Optional[str] = None,
    value_cols: Optional[Union[List[str], str]] = None,
    freq: Optional[Union[str, int]] = None,
    drop_tail_nan: bool = False,
    dtype: Optional[Union[type, Dict[str, type]]] = None,
) -> pd.DataFrame:
    """Transforms a DataFrame or Series into a time-indexed DataFrame.

    Args:
        data (Union[pd.DataFrame, pd.Series]): The input data containing time series information.
        time_col (Optional[str]): The column name representing time information. If None, uses the index.
        value_cols (Optional[Union[List[str], str]]): Columns to extract as values. If None, uses all except time_col.
        freq (Optional[Union[str, int]]): The frequency of the time series data. An `int` step
            for integer-typed time columns, a pandas offset alias (e.g. "D") for datetimes.
        drop_tail_nan (bool): If True, drop trailing rows in which every value column is NaN.
        dtype (Optional[Union[type, Dict[str, type]]]): Enforce a specific data type (or a
            per-column mapping) on the resulting DataFrame.

    Returns:
        pd.DataFrame: A DataFrame with time as the index and specified value columns.

    Raises:
        ValueError: If the time column doesn't exist, the `freq` type doesn't match the
            time column type, the row count mismatches the integer range, or the
            frequency cannot be inferred.
    """
    # Select value columns: either the explicit ones, or everything but time_col.
    if value_cols is None:
        if isinstance(data, pd.Series):
            series_data = data.copy()
        else:
            series_data = data.loc[:, data.columns != time_col].copy()
    else:
        series_data = data.loc[:, value_cols].copy()

    # Determine the time column values.
    if time_col:
        if time_col not in data.columns:
            raise ValueError(
                "The time column: {} doesn't exist in the `data`!".format(time_col)
            )
        time_col_vals = data.loc[:, time_col]
    else:
        time_col_vals = data.index

    # An integer time column paired with a string freq is treated as datetimes
    # encoded as integers (e.g. 20210101); stringify so to_datetime can parse.
    if np.issubdtype(time_col_vals.dtype, np.integer) and isinstance(freq, str):
        time_col_vals = time_col_vals.astype(str)

    if np.issubdtype(time_col_vals.dtype, np.integer):
        # Integer time column -> RangeIndex with an integer step.
        if freq:
            if not isinstance(freq, int) or freq < 1:
                raise ValueError(
                    "The type of `freq` should be `int` when the type of `time_col` is `RangeIndex`."
                )
        else:
            freq = 1  # Default step for integer index
        start_idx, stop_idx = min(time_col_vals), max(time_col_vals) + freq
        if (stop_idx - start_idx) / freq != len(data):
            raise ValueError("The number of rows doesn't match with the RangeIndex!")
        time_index = pd.RangeIndex(start=start_idx, stop=stop_idx, step=freq)
    elif np.issubdtype(time_col_vals.dtype, np.object_) or np.issubdtype(
        time_col_vals.dtype, np.datetime64
    ):
        # Datetime-like time column -> DatetimeIndex.
        # NOTE: the deprecated `infer_datetime_format` kwarg (a no-op removed in
        # pandas 2.0) is intentionally not passed; parsing results are unchanged.
        time_col_vals = pd.to_datetime(time_col_vals)
        time_index = pd.DatetimeIndex(time_col_vals)
        if freq:
            if not isinstance(freq, str):
                raise ValueError(
                    "The type of `freq` should be `str` when the type of `time_col` is `DatetimeIndex`."
                )
        else:
            # Attempt to infer frequency if not provided.
            freq = pd.infer_freq(time_index)
            if freq is None:
                raise ValueError(
                    "Failed to infer the `freq`. A valid `freq` is required."
                )
            # infer_freq may return a negative alias for descending input.
            if freq[0] == "-":
                freq = freq[1:]
    else:
        raise ValueError("The type of `time_col` is invalid.")

    # Ensure series_data is a DataFrame.
    if isinstance(series_data, pd.Series):
        series_data = series_data.to_frame()

    # Set time index and sort data.
    series_data.set_index(time_index, inplace=True)
    series_data.sort_index(inplace=True)

    # BUG FIX: `drop_tail_nan` and `dtype` were documented but previously
    # ignored; honor them here (defaults preserve the old behavior).
    if drop_tail_nan:
        valid = series_data.notna().any(axis=1)
        if valid.any():
            # Keep everything up to (and including) the last row with data.
            series_data = series_data.loc[: valid[valid].index[-1]]
        else:
            series_data = series_data.iloc[0:0]
    if dtype is not None:
        series_data = series_data.astype(dtype)
    return series_data
|
|
|
+
|
|
|
+
|
|
|
def load_from_dataframe(
    df: pd.DataFrame,
    group_id: Optional[str] = None,
    time_col: Optional[str] = None,
    target_cols: Optional[Union[List[str], str]] = None,
    label_col: Optional[Union[List[str], str]] = None,
    observed_cov_cols: Optional[Union[List[str], str]] = None,
    feature_cols: Optional[Union[List[str], str]] = None,
    known_cov_cols: Optional[Union[List[str], str]] = None,
    static_cov_cols: Optional[Union[List[str], str]] = None,
    freq: Optional[Union[str, int]] = None,
    fill_missing_dates: bool = False,
    fillna_method: str = "pre",
    fillna_window_size: int = 10,
    **kwargs,
) -> Dict[str, Optional[Union[pd.DataFrame, Dict[str, any]]]]:
    """Loads and processes time series data from a DataFrame.

    This function extracts and organizes time series data from a given DataFrame.
    It supports optional grouping and extraction of specific columns as features.

    Args:
        df (pd.DataFrame): The input DataFrame containing time series data.
        group_id (Optional[str]): Column name used for grouping the data.
        time_col (Optional[str]): Name of the time column.
        target_cols (Optional[Union[List[str], str]]): Columns to be used as target.
        label_col (Optional[Union[List[str], str]]): Single column to be used as label;
            an alias for a one-column `target_cols`.
        observed_cov_cols (Optional[Union[List[str], str]]): Columns for observed covariates.
        feature_cols (Optional[Union[List[str], str]]): Columns to be used as features;
            an alias for `observed_cov_cols`.
        known_cov_cols (Optional[Union[List[str], str]]): Columns for known covariates.
        static_cov_cols (Optional[Union[List[str], str]]): Columns for static covariates;
            each must hold a single unique value per group.
        freq (Optional[Union[str, int]]): Frequency of the time series data.
        fill_missing_dates (bool): Whether to fill missing dates in the time series.
            NOTE(review): currently accepted but not acted on in this function.
        fillna_method (str): Method to fill missing values ('pre' or 'post').
            NOTE(review): currently accepted but not acted on in this function.
        fillna_window_size (int): Window size for filling missing values.
            NOTE(review): currently accepted but not acted on in this function.
        **kwargs: Additional keyword arguments.

    Returns:
        Dict[str, Optional[Union[pd.DataFrame, Dict[str, any]]]]: A dictionary with keys
        "past_target", "observed_cov_numeric", "known_cov_numeric" and
        "static_cov_numeric" for the first group.

    Raises:
        ValueError: If `label_col` selects more than one column, or if a static
            covariate column is missing or not constant.
    """
    # Split into per-group frames when a grouping column is given.
    if group_id is not None:
        dfs = [df[df[group_id].isin([gid])] for gid in df[group_id].unique()]
    else:
        dfs = [df]

    # Result list to store processed data from each group.
    res = []

    # `label_col` is an alias for a single-column target.
    if label_col:
        # BUG FIX: the previous check (`isinstance(label_col, str) and
        # len(label_col) > 1`) rejected any column *name* longer than one
        # character. The intent is to reject multi-element sequences.
        if not isinstance(label_col, str) and len(label_col) > 1:
            raise ValueError("The length of label_col must be 1.")
        target_cols = label_col

    # `feature_cols` is an alias for observed covariates.
    if feature_cols:
        observed_cov_cols = feature_cols

    # Process each per-group DataFrame (renamed to avoid shadowing the `df` param).
    for sub_df in dfs:
        target = None
        observed_cov = None
        known_cov = None
        static_cov = dict()

        # If no specific columns are provided, use all columns except time_col.
        if not any([target_cols, observed_cov_cols, known_cov_cols, static_cov_cols]):
            target = load_from_one_dataframe(
                sub_df,
                time_col,
                [a for a in sub_df.columns if a != time_col],
                freq,
            )
        else:
            if target_cols:
                target = load_from_one_dataframe(
                    sub_df,
                    time_col,
                    target_cols,
                    freq,
                )

            if observed_cov_cols:
                observed_cov = load_from_one_dataframe(
                    sub_df,
                    time_col,
                    observed_cov_cols,
                    freq,
                )

            if known_cov_cols:
                known_cov = load_from_one_dataframe(
                    sub_df,
                    time_col,
                    known_cov_cols,
                    freq,
                )

            if static_cov_cols:
                if isinstance(static_cov_cols, str):
                    static_cov_cols = [static_cov_cols]
                for col in static_cov_cols:
                    # A static covariate must exist and be constant within the group.
                    if col not in sub_df.columns or len(np.unique(sub_df[col])) != 1:
                        raise ValueError(
                            "Static covariate columns data is not in columns or schema is not correct!"
                        )
                    static_cov[col] = sub_df[col].iloc[0]
        # Append the processed data into the results list.
        res.append(
            {
                "past_target": target,
                "observed_cov_numeric": observed_cov,
                "known_cov_numeric": known_cov,
                "static_cov_numeric": static_cov,
            }
        )
    # NOTE(review): only the first group's result is returned — presumably the
    # callers use single-group data; confirm before relying on multi-group input.
    return res[0]
|
|
|
+
|
|
|
+
|
|
|
def _distance_to_holiday(holiday) -> Callable[[pd.Timestamp], float]:
    """Build a callable that measures the signed day-distance to *holiday*.

    Args:
        holiday: An object exposing a ``dates(start, end)`` method returning
            the occurrences of the holiday within the given range.

    Returns:
        Callable[[pd.Timestamp], float]: Maps a date index to the signed
        number of days between it and the nearest holiday occurrence.
    """

    def _signed_days(index: pd.Timestamp) -> float:
        """Signed days from *index* to the nearest occurrence of the holiday.

        Raises:
            AssertionError: If no occurrence falls inside the search window.
        """
        window_lo = index - pd.Timedelta(days=MAX_WINDOW)
        window_hi = index + pd.Timedelta(days=MAX_WINDOW)
        nearby = holiday.dates(window_lo, window_hi)
        assert (
            len(nearby) != 0
        ), f"No closest holiday for the date index {index} found."
        # Two dates may come back when the index sits exactly half a year
        # from the holiday; the first one (smaller distance, 182 days) is used.
        return float((index - nearby[0]).days)

    return _signed_days
|
|
|
+
|
|
|
+
|
|
|
def time_feature(
    dataset: Dict,
    freq: Optional[Union[str, int]],
    feature_cols: List[str],
    extend_points: int,
    inplace: bool = False,
) -> Dict:
    """Transforms the time column of a dataset into time features.

    This function extracts time-related features from the time column in a
    dataset, optionally extending the time series for future points and
    normalizing holiday distances.

    Args:
        dataset (Dict): Dataset to be transformed; expected keys include
            "past_target" and "known_cov_numeric" (DataFrame or None).
        freq (Optional[Union[str, int]]): Frequency of the time series data.
            If not provided, the frequency will be inferred.
        feature_cols (List[str]): Feature names to extract — keys of
            CAL_DATE_METHOD, or "holidays" for holiday-distance columns.
        extend_points (int): Number of future points to extend the time series.
        inplace (bool): Whether to perform the transformation inplace. Default is False.

    Returns:
        Dict: The transformed dataset with time features added under
        "known_cov_numeric".

    Raises:
        ValueError: If the time column is of an integer type instead of datetime.
    """
    new_ts = dataset
    if not inplace:
        new_ts = dataset.copy()
    # Get known_cov_numeric or initialize with past target index.
    kcov = new_ts["known_cov_numeric"]
    # BUG FIX: `kcov` is a DataFrame or None, and truth-testing a DataFrame
    # (`if not kcov:`) raises ValueError — compare against None explicitly.
    if kcov is None:
        tf_kcov = new_ts["past_target"].index.to_frame()
    else:
        tf_kcov = kcov.index.to_frame()
    time_col = tf_kcov.columns[0]
    # Check if time column is of datetime type.
    if np.issubdtype(tf_kcov[time_col].dtype, np.integer):
        raise ValueError(
            "The time_col can't be the type of numpy.integer, and it must be the type of numpy.datetime64"
        )
    # Extend the time series if no known_cov_numeric.
    if kcov is None:
        freq = freq if freq is not None else pd.infer_freq(tf_kcov[time_col])
        # Generate extend_points future stamps after the last observed one.
        # Slicing off the first point reproduces the old `closed="right"`
        # behavior (that kwarg was removed in pandas 2.0).
        extend_time = pd.date_range(
            start=tf_kcov[time_col].iloc[-1],
            freq=freq,
            periods=extend_points + 1,
            name=time_col,
        )[1:].to_frame()
        tf_kcov = pd.concat([tf_kcov, extend_time])

    # Extract and add time features to known_cov_numeric.
    for k in feature_cols:
        if k != "holidays":
            v = tf_kcov[time_col].apply(CAL_DATE_METHOD[k])
            v.index = tf_kcov[time_col]

            if new_ts["known_cov_numeric"] is None:
                new_ts["known_cov_numeric"] = pd.DataFrame(v.rename(k), index=v.index)
            else:
                new_ts["known_cov_numeric"][k] = v.rename(k).reindex(
                    new_ts["known_cov_numeric"].index
                )
        else:
            # One signed-distance column per holiday rule: "holidays_<i>".
            holidays_col = []
            for i, H in enumerate(HOLIDAYS):
                col_name = k + "_" + str(i)
                v = tf_kcov[time_col].apply(_distance_to_holiday(H))
                v.index = tf_kcov[time_col]
                holidays_col.append(col_name)
                if new_ts["known_cov_numeric"] is None:
                    new_ts["known_cov_numeric"] = pd.DataFrame(
                        v.rename(col_name), index=v.index
                    )
                else:
                    new_ts["known_cov_numeric"][col_name] = v.rename(col_name).reindex(
                        new_ts["known_cov_numeric"].index
                    )

            # Standardize all holiday-distance columns jointly.
            scaler = StandardScaler()
            scaler.fit(new_ts["known_cov_numeric"][holidays_col])
            new_ts["known_cov_numeric"][holidays_col] = scaler.transform(
                new_ts["known_cov_numeric"][holidays_col]
            )
    return new_ts
|