Data Cleaning

#exports
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

import os
import glob
from ipypb import track

from batopt import utils, retrieval
from IPython.display import JSON


User Inputs

raw_data_dir = '../data/raw'
cache_data_dir = '../data/nb-cache'


Loading the Raw Data

We'll start by loading in the demand data. First, we have to determine the latest training set available for us to work with

#exports
def identify_latest_set_num(data_dir):
    set_num = max([
        int(f.split('_set')[1].replace('.csv', '')) 
        for f in os.listdir(data_dir) 
        if 'set' in f
    ])

    return set_num
set_num = identify_latest_set_num(raw_data_dir)

set_num
4


We'll then load in the dataset and clean its datetime index

#exports
def reindex_df_dt_idx(df, freq='30T'):
    full_dt_idx = pd.date_range(df.index.min(), df.index.max(), freq=freq)
    df = df.reindex(full_dt_idx)

    return df

def load_training_dataset(raw_data_dir: str, dataset_name: str='demand', set_num=None, parse_dt_idx: bool=True, dt_idx_freq: str='30T') -> pd.DataFrame:
    if set_num is None:
        set_num = identify_latest_set_num(raw_data_dir)

    allowed_datasets = ['demand', 'pv', 'weather']
    assert dataset_name in allowed_datasets, f"`dataset_name` must be one of: {', '.join(allowed_datasets)} - not {dataset_name}"

    df = pd.read_csv(glob.glob(f'{raw_data_dir}/{dataset_name}*set{set_num}.csv')[0].replace('\\', '/'))

    if parse_dt_idx == True:
        assert 'datetime' in df.columns, 'if `parse_dt_idx` is True then `datetime` must be a column in the dataset'

        df['datetime'] = pd.to_datetime(df['datetime'], utc=True)
        df = df.set_index('datetime').pipe(reindex_df_dt_idx, freq=dt_idx_freq).sort_index(axis=1)
        df.index.name = 'datetime'

    return df
df_demand = load_training_dataset(raw_data_dir, 'demand')

df_demand.head()
demand_MW
2017-11-03 00:00:00+00:00 2.19
2017-11-03 00:30:00+00:00 2.14
2017-11-03 01:00:00+00:00 2.01
2017-11-03 01:30:00+00:00 1.87
2017-11-03 02:00:00+00:00 1.86


Then the PV data

df_pv = load_training_dataset(raw_data_dir, 'pv')

df_pv.head()
irradiance_Wm-2 panel_temp_C pv_power_mw
2017-11-03 00:00:00+00:00 0 7.05 0
2017-11-03 00:30:00+00:00 0 7.38 0
2017-11-03 01:00:00+00:00 0 7.7 0
2017-11-03 01:30:00+00:00 0 7.48 0
2017-11-03 02:00:00+00:00 0 7.2 0


And finally the weather data

df_weather = load_training_dataset(raw_data_dir, 'weather', dt_idx_freq='H')

df_weather.head(3)
solar_location1 solar_location2 solar_location3 solar_location4 solar_location5 solar_location6 temp_location1 temp_location2 temp_location3 temp_location4 temp_location5 temp_location6
2015-01-01 00:00:00+00:00 0 0 0 0 0 0 9.75 9.65 8.83 7.58 11.62 11.22
2015-01-01 01:00:00+00:00 0 0 0 0 0 0 9.91 9.76 8.9 7.62 11.65 11.32
2015-01-01 02:00:00+00:00 0 0 0 0 0 0 9.99 9.8 9.1 7.61 11.65 11.3


We'll also create a function that reads all of the datasets in at once and then combines them

#exports
def combine_training_datasets(raw_data_dir, set_num=None):
    # Loading provided training datasets
    single_datasets = dict()
    dataset_names = ['demand', 'pv', 'weather']

    for dataset_name in dataset_names:
        single_datasets[dataset_name] = load_training_dataset(raw_data_dir, dataset_name, set_num=set_num)

    # Constructing date range
    min_dt = min([df.index.min() for df in single_datasets.values()])
    max_dt = max([df.index.max() for df in single_datasets.values()]) + pd.Timedelta(minutes=30)

    dt_rng = pd.date_range(min_dt, max_dt, freq='30T')

    # Constructing combined dataframe
    df_combined = pd.DataFrame(index=dt_rng, columns=dataset_names)

    for dataset_name in dataset_names:
        df_single_dataset = single_datasets[dataset_name]
        cols_to_be_overwritten = set(df_combined.columns) & set(df_single_dataset.columns)
        assert len(cols_to_be_overwritten) == 0, f"The following columns exist in multiple datasets meaning data would be overwritten: {', '.join(cols_to_be_overwritten)}"

        df_combined[df_single_dataset.columns] = df_single_dataset

    df_combined = df_combined.sort_index()

    # Adding holiday dates
    s_holidays = retrieval.load_holidays_s(raw_data_dir)

    s_cropped_holidays = s_holidays[max(df_combined.index.min(), s_holidays.index.min()):
                                    min(df_combined.index.max(), s_holidays.index.max())]

    df_combined.loc[s_cropped_holidays.index, 'holidays'] = s_cropped_holidays

    return df_combined
df_combined = combine_training_datasets(raw_data_dir)

df_combined.head(3)
demand pv weather demand_MW irradiance_Wm-2 panel_temp_C pv_power_mw solar_location1 solar_location2 solar_location3 solar_location4 solar_location5 solar_location6 temp_location1 temp_location2 temp_location3 temp_location4 temp_location5 temp_location6 holidays
2015-01-01 00:00:00+00:00 nan nan nan nan nan nan nan 0 0 0 0 0 0 9.75 9.65 8.83 7.58 11.62 11.22 nan
2015-01-01 00:30:00+00:00 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
2015-01-01 01:00:00+00:00 nan nan nan nan nan nan nan 0 0 0 0 0 0 9.91 9.76 8.9 7.62 11.65 11.32 nan


Identifying Missing Values

We'll quickly inspect the datasets and check each one's coverage over the full date range, aggregated by dataset

#exports
def identify_df_dt_entries(df_demand, df_pv, df_weather):
    min_dt = min(df_demand.index.min(), df_pv.index.min(), df_weather.index.min())
    max_dt = max(df_demand.index.max(), df_pv.index.max(), df_weather.index.max())

    dt_rng = pd.date_range(min_dt, max_dt, freq='30T')
    df_nulls = pd.DataFrame(index=dt_rng)

    df_nulls['demand'] = df_demand.reindex(dt_rng).isnull().mean(axis=1).astype(int)
    df_nulls['pv'] = df_pv.reindex(dt_rng).isnull().mean(axis=1).astype(int)
    df_nulls['weather'] = df_weather.reindex(dt_rng).ffill(limit=1).isnull().mean(axis=1).astype(int) # forward-fill one step as the weather data is hourly rather than half-hourly

    df_entries = 1 - df_nulls

    return df_entries
df_entries = identify_df_dt_entries(df_demand, df_pv, df_weather)

# Plotting
fig, ax = plt.subplots(dpi=150)

sns.heatmap(df_entries.T, ax=ax, cmap=plt.cm.binary)

utils.set_date_ticks(ax, df_entries.index.min().strftime('%Y-%m-%d'), df_entries.index.max().strftime('%Y-%m-%d'), axis='x', freq='Qs', date_format='%b %y')
<AxesSubplot:>

png


We'll also determine the proportion of null values in each individual column

df_demand.isnull().mean()
demand_MW    0.0
dtype: float64


We can see that all of the PV data columns are missing some data

df_pv.isnull().mean()
irradiance_Wm-2    0.001863
panel_temp_C       0.001991
pv_power_mw        0.000771
dtype: float64


Locations 1 and 2 are also missing some solar data, and location 4 is missing some temperature data

df_weather.isnull().mean()
solar_location1    0.001487
solar_location2    0.000992
solar_location3    0.000000
solar_location4    0.000000
solar_location5    0.000000
solar_location6    0.000000
temp_location1     0.000000
temp_location2     0.000000
temp_location3     0.000000
temp_location4     0.000992
temp_location5     0.000000
temp_location6     0.000000
dtype: float64


Handling Missing Values

We'll start by interpolating the missing PV data. First we'll check how many variables have null values in each time period

s_pv_num_null_vals = df_pv.isnull().sum(axis=1).replace(0, np.nan).dropna().astype(int)

s_pv_num_null_vals.value_counts()
1    103
3     24
dtype: int64


There are no time-periods in which exactly two of the variables are null; whenever more than one variable is missing, all three are missing at once. One hypothesis is that pv_power_mw and irradiance_Wm-2 always have null periods at the same time, which would make it harder to interpolate their values. We'll quickly check this hypothesis:

(df_pv['pv_power_mw'].isnull() == df_pv['irradiance_Wm-2'].isnull()).mean() == 1
False


The hypothesis doesn't hold exactly (there are some periods where irradiance is missing but power is not). However, looking at the rows where pv_power_mw is null, we can see that they fall within blocks where all three variables are missing at once, coinciding with a larger set of missing values within panel_temp_C.

df_pv[df_pv['pv_power_mw'].isnull()]
irradiance_Wm-2 panel_temp_C pv_power_mw
2018-03-04 07:00:00+00:00 nan nan nan
2018-03-04 07:30:00+00:00 nan nan nan
2018-03-04 08:00:00+00:00 nan nan nan
2018-03-04 08:30:00+00:00 nan nan nan
2018-03-04 09:00:00+00:00 nan nan nan
2018-03-04 09:30:00+00:00 nan nan nan
2018-03-04 10:00:00+00:00 nan nan nan
2018-03-04 10:30:00+00:00 nan nan nan
2018-03-04 11:00:00+00:00 nan nan nan
2018-03-04 11:30:00+00:00 nan nan nan
2018-03-04 12:00:00+00:00 nan nan nan
2018-03-04 12:30:00+00:00 nan nan nan
2018-03-04 13:00:00+00:00 nan nan nan
2018-03-04 13:30:00+00:00 nan nan nan
2018-03-04 14:00:00+00:00 nan nan nan
2018-03-04 15:00:00+00:00 nan nan nan
2018-03-04 15:30:00+00:00 nan nan nan
2018-03-04 16:00:00+00:00 nan nan nan
2018-03-04 16:30:00+00:00 nan nan nan
2018-03-04 17:00:00+00:00 nan nan nan
2019-07-19 14:00:00+00:00 nan nan nan
2019-07-19 14:30:00+00:00 nan nan nan
2019-07-19 15:00:00+00:00 nan nan nan
2019-07-19 15:30:00+00:00 nan nan nan


Looking at the panel_temp_C data we can see there are 3 time-blocks where observations are missing

df_pv['panel_temp_C'].isnull().astype(int).plot()
<AxesSubplot:xlabel='datetime'>

png
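
We can also sanity-check this programmatically by counting the starts of each run of consecutive nulls (a quick sketch, not part of the original module code):

s_null = df_pv['panel_temp_C'].isnull()
num_null_blocks = (s_null & ~s_null.shift(fill_value=False)).sum() # a block starts wherever a null follows a non-null

num_null_blocks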


One option might be to replace the missing temperature values with the temperatures observed at the surrounding weather grid locations. We'll start by constructing a dataframe that includes all of the temperature data, as well as the rolling average temperature at each weather data location.

#exports
def construct_df_temp_features(df_weather, df_pv):
    df_weather = df_weather.reindex(pd.date_range(df_weather.index.min(), df_weather.index.max(), freq='30T')).ffill(limit=1)
    temp_loc_cols = df_weather.columns[df_weather.columns.str.contains('temp')]

    df_temp_features = (df_weather
                        .copy()
                        [temp_loc_cols]
                        .assign(site_temp=df_pv['panel_temp_C'])
                       )

    df_temp_features[[col+'_rolling' for col in temp_loc_cols]] = df_temp_features.rolling(3).mean()[temp_loc_cols]

    df_temp_features = df_temp_features.sort_index(axis=1)

    return df_temp_features
df_temp_features = construct_df_temp_features(df_weather, df_pv).dropna()

df_temp_features.head()
site_temp temp_location1 temp_location1_rolling temp_location2 temp_location2_rolling temp_location3 temp_location3_rolling temp_location4 temp_location4_rolling temp_location5 temp_location5_rolling temp_location6 temp_location6_rolling
2017-11-03 00:00:00+00:00 7.05 8.56 8.62667 9.64 9.66 7.46 7.78667 6.68 6.93333 13.09 13.0233 13.2 13.1133
2017-11-03 00:30:00+00:00 7.38 8.56 8.59333 9.64 9.65 7.46 7.62333 6.68 6.80667 13.09 13.0567 13.2 13.1567
2017-11-03 01:00:00+00:00 7.7 8.69 8.60333 9.71 9.66333 7.14 7.35333 6.27 6.54333 13.21 13.13 13.32 13.24
2017-11-03 01:30:00+00:00 7.48 8.69 8.64667 9.71 9.68667 7.14 7.24667 6.27 6.40667 13.21 13.17 13.32 13.28
2017-11-03 02:00:00+00:00 7.2 8.74 8.70667 9.73 9.71667 6.86 7.04667 5.91 6.15 13.3 13.24 13.36 13.3333


We'll now check the correlations between these features

sns.heatmap(df_temp_features.corr())
<AxesSubplot:>

png


The correlation drops off quickly when it gets to the site temperature. Looking at the full distributions, we can see that the site measurements reach far higher values: the panel absorbs heat that raises its temperature above that of the surrounding area, again making it more difficult to simply fill in the gaps with the nearby temperature measurements.
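
To put a number on the correlation drop-off we can pull out the correlations with the site temperature directly; a quick check using the df_temp_features frame constructed above:

df_temp_features.corr()['site_temp'].drop('site_temp').sort_values(ascending=False)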

sns.histplot(df_temp_features['site_temp'], color='C0', label='Panel')
sns.histplot(df_temp_features.drop('site_temp', axis=1).min(axis=1), color='C1', label='MERRA Min')
sns.histplot(df_temp_features.drop('site_temp', axis=1).max(axis=1), color='C2', label='MERRA Max')

plt.legend(frameon=False)
<matplotlib.legend.Legend at 0x1fa8d979310>

png

# Could use an RF to estimate the panel temp based on the weather grid temps?
# Potential features: current average surrounding temp, average surrounding temp over the last 3 hours
#exports
def split_X_y_data(df, target_col='site_temp'):
    df = df.dropna()
    X_cols = df.drop(target_col, axis=1).columns

    X = df[X_cols].values
    y = df[target_col].values

    return X, y

def split_X_y_data_with_index(df, target_col='site_temp'):
    df = df.dropna()
    X_cols = df.drop(target_col, axis=1).columns

    X = df[X_cols].values
    y = df[target_col].values
    index = df.index

    return X, y, index
X, y = split_X_y_data(df_temp_features)

X.shape, y.shape
((37024, 12), (37024,))
#exports
def generate_kfold_preds(
    X, 
    y, 
    model=LinearRegression(), 
    kfold_kwargs={'n_splits': 5, 'shuffle': True},
    index=None
):

    kfold = KFold(**kfold_kwargs)
    df_pred = pd.DataFrame(columns=['pred', 'true'], index=np.arange(X.shape[0]))

    for train_idxs, test_idxs in kfold.split(X):
        X_train, y_train = X[train_idxs], y[train_idxs]
        X_test, y_test = X[test_idxs], y[test_idxs]

        model.fit(X_train, y_train)

        df_pred.loc[test_idxs, 'true'] = y_test
        df_pred.loc[test_idxs, 'pred'] = model.predict(X_test)

    df_pred = df_pred.sort_index()

    if index is not None:
        assert len(index) == df_pred.shape[0], 'The passed index must be the same length as X and y'
        df_pred.index = index

    return df_pred
df_pred = generate_kfold_preds(X, y)

df_pred.head()
pred true
0 4.75986 7.05
1 5.00606 7.38
2 5.40829 7.7
3 5.49439 7.48
4 5.27108 7.2
#exports
def evaluate_models(X, y, models, post_pred_proc_func=None, index=None):
    model_scores = dict()

    for model_name, model in track(models.items()):
        df_pred = generate_kfold_preds(X, y, model, index=index)

        if post_pred_proc_func is not None:
            df_pred['pred'] = post_pred_proc_func(df_pred['pred'])

        model_scores[model_name] = {
            'mae': mean_absolute_error(df_pred['true'], df_pred['pred']),
            'rmse': np.sqrt(mean_squared_error(df_pred['true'], df_pred['pred']))
        }

    df_model_scores = pd.DataFrame(model_scores)

    df_model_scores.index.name = 'metric'
    df_model_scores.columns.name = 'model'

    return df_model_scores
models = {
    'std_linear': LinearRegression(),
    'random_forest': RandomForestRegressor(),
    'boosted': GradientBoostingRegressor()
}

rerun_panel_temp_model = False
model_scores_filename = 'panel_temp_interp_model_results.csv'

if (rerun_panel_temp_model == True) or (model_scores_filename not in os.listdir(cache_data_dir)):
    df_model_scores = evaluate_models(X, y, models)
    df_model_scores.to_csv(f'{cache_data_dir}/{model_scores_filename}')
else:
    df_model_scores = pd.read_csv(f'{cache_data_dir}/{model_scores_filename}', index_col='metric')

df_model_scores
std_linear random_forest boosted
mae 2.81922 1.68451 2.58143
rmse 3.78674 2.69334 3.73415
top_model = df_model_scores.T['rmse'].idxmin()
df_pred = generate_kfold_preds(X, y, models[top_model])

df_pred.head()
pred true
0 6.0645 7.05
1 6.1691 7.38
2 7.5151 7.7
3 7.7232 7.48
4 7.1748 7.2
s_residuals = df_pred.diff(1, axis=1).dropna(axis=1).iloc[:, 0]

s_residuals.plot(linewidth=0.3)
<AxesSubplot:>

png

plt.scatter(df_pred['true'], df_pred['pred'], s=1)

plt.xlabel('Observation')
plt.ylabel('Prediction')
Text(0, 0.5, 'Prediction')

png

#exports
def interpolate_missing_panel_temps(df_pv, df_weather, model=RandomForestRegressor()):
    missing_panel_temp_dts = df_pv.index[df_pv['panel_temp_C'].isnull()]

    if len(missing_panel_temp_dts) == 0: # i.e. no missing values
        return df_pv

    df_temp_features = construct_df_temp_features(df_weather, df_pv)
    missing_dt_X = df_temp_features.loc[missing_panel_temp_dts].drop('site_temp', axis=1).values
    X, y = split_X_y_data(df_temp_features, 'site_temp')

    model.fit(X, y)
    df_pv.loc[missing_panel_temp_dts, 'panel_temp_C'] = model.predict(missing_dt_X)

    assert df_pv['panel_temp_C'].isnull().sum() == 0, 'There are still null values for the PV panel temperature'

    return df_pv
df_pv = interpolate_missing_panel_temps(df_pv, df_weather)

df_pv.isnull().mean()
irradiance_Wm-2    0.002016
panel_temp_C       0.000000
pv_power_mw        0.000645
dtype: float64
#exports
def construct_df_irradiance_features(df_weather, df_pv):
    df_weather = df_weather.reindex(pd.date_range(df_weather.index.min(), df_weather.index.max(), freq='30T')).ffill(limit=1)
    solar_loc_cols = df_weather.columns[df_weather.columns.str.contains('solar')]

    df_irradiance_features = (df_weather
                              .copy()
                              [solar_loc_cols]
                              .assign(site_solar=df_pv['irradiance_Wm-2'])
                              .pipe(lambda df: df.assign(hour=df.index.hour + df.index.minute/60))
                             )

    df_irradiance_features = df_irradiance_features.sort_index(axis=1)

    return df_irradiance_features
df_irradiance_features = construct_df_irradiance_features(df_weather, df_pv)

df_irradiance_features.head()
hour site_solar solar_location1 solar_location2 solar_location3 solar_location4 solar_location5 solar_location6
2015-01-01 00:00:00+00:00 0 nan 0 0 0 0 0 0
2015-01-01 00:30:00+00:00 0.5 nan 0 0 0 0 0 0
2015-01-01 01:00:00+00:00 1 nan 0 0 0 0 0 0
2015-01-01 01:30:00+00:00 1.5 nan 0 0 0 0 0 0
2015-01-01 02:00:00+00:00 2 nan 0 0 0 0 0 0
models = {
    'std_linear': LinearRegression(),
    'random_forest': RandomForestRegressor(),
    'boosted': GradientBoostingRegressor()
}

rerun_site_irradiance_model = False
model_scores_filename = 'site_irradiance_interp_model_results.csv'

X, y = split_X_y_data(df_irradiance_features, 'site_solar')

if (rerun_site_irradiance_model == True) or (model_scores_filename not in os.listdir(cache_data_dir)):
    df_model_scores = evaluate_models(X, y, models)
    df_model_scores.to_csv(f'{cache_data_dir}/{model_scores_filename}')
else:
    df_model_scores = pd.read_csv(f'{cache_data_dir}/{model_scores_filename}', index_col='metric')

df_model_scores
std_linear random_forest boosted
mae 57.4977 37.5087 49.6956
rmse 110.546 78.8525 99.1964
top_model = df_model_scores.T['rmse'].idxmin()
df_pred = generate_kfold_preds(X, y, models[top_model])

df_pred.head()
pred true
0 0.000657 0
1 0.00044 0
2 0.000239 0
3 0.001181 0
4 0.001369 0
plt.scatter(df_pred['true'], df_pred['pred'], s=1)

plt.xlabel('Observation')
plt.ylabel('Prediction')
Text(0, 0.5, 'Prediction')

png

#exports
def interpolate_missing_site_irradiance(df_pv, df_weather, model=RandomForestRegressor()):
    missing_site_irradiance_dts = df_pv.index[df_pv['irradiance_Wm-2'].isnull()]

    if len(missing_site_irradiance_dts) == 0: # i.e. no missing values
        return df_pv

    df_irradiance_features = construct_df_irradiance_features(df_weather, df_pv)
    missing_dt_X = df_irradiance_features.loc[missing_site_irradiance_dts].drop('site_solar', axis=1).values
    X, y = split_X_y_data(df_irradiance_features, 'site_solar')

    model.fit(X, y)
    df_pv.loc[missing_site_irradiance_dts, 'irradiance_Wm-2'] = model.predict(missing_dt_X)

    assert df_pv['irradiance_Wm-2'].isnull().sum() == 0, 'There are still null values for the solar site irradiance'

    return df_pv
df_pv = interpolate_missing_site_irradiance(df_pv, df_weather)

df_pv.isnull().mean()
irradiance_Wm-2    0.000000
panel_temp_C       0.000000
pv_power_mw        0.000645
dtype: float64


Now that we have the irradiance and temperature, we're ready to start filling in the missing power output values, again using the same regression interpolation approach

#exports
def construct_df_power_features(df_pv):
    df_power_features = (df_pv
                         .pipe(lambda df: df.assign(hour=df.index.hour + df.index.minute/60))
                         .sort_index(axis=1)
                        )

    return df_power_features
df_power_features = construct_df_power_features(df_pv)

df_power_features.head()
hour irradiance_Wm-2 panel_temp_C pv_power_mw
2017-11-03 00:00:00+00:00 0 0 7.05 0
2017-11-03 00:30:00+00:00 0.5 0 7.38 0
2017-11-03 01:00:00+00:00 1 0 7.7 0
2017-11-03 01:30:00+00:00 1.5 0 7.48 0
2017-11-03 02:00:00+00:00 2 0 7.2 0
models = {
    'std_linear': LinearRegression(),
    'random_forest': RandomForestRegressor(),
    'boosted': GradientBoostingRegressor()
}

rerun_site_power_model = False
model_scores_filename = 'site_power_interp_model_results.csv'

X, y, dates = split_X_y_data_with_index(df_power_features, 'pv_power_mw')

if (rerun_site_power_model == True) or (model_scores_filename not in os.listdir(cache_data_dir)):
    df_model_scores = evaluate_models(X, y, models)
    df_model_scores.to_csv(f'{cache_data_dir}/{model_scores_filename}')
else:
    df_model_scores = pd.read_csv(f'{cache_data_dir}/{model_scores_filename}', index_col='metric')

df_model_scores
std_linear random_forest boosted
mae 0.061519 0.041122 0.043927
rmse 0.14598 0.135822 0.133212
top_model = df_model_scores.T['rmse'].idxmin()
df_pred = generate_kfold_preds(X, y, models[top_model])

df_pred.head()
pred true
0 -0.000829 0
1 -0.000634 0
2 -0.000178 0
3 0.000163 0
4 -0.001097 0
plt.scatter(df_pred['true'], df_pred['pred'], s=1)

plt.xlabel('Observation')
plt.ylabel('Prediction')
Text(0, 0.5, 'Prediction')

png

Anomalous Data Points in PV Data

The PV data shows a number of points where the observed value is 0 but the prediction is much higher.

First let's try and identify them (setting the tolerance to be lower will capture more values as anomalous).

def identify_anomalies_pv(df_pred, tolerance=0.1):
    df_anomalies = df_pred.copy()
    df_anomalies['difference'] = df_anomalies.pred - df_anomalies.true
    df_anomalies = df_anomalies[(df_anomalies.difference > tolerance) & (df_anomalies.true == 0)]
    return df_anomalies.index

anomalous_dates = dates[identify_anomalies_pv(df_pred)]
anomalous_df = df_power_features[df_power_features.index.isin(anomalous_dates)]
plt.hist(anomalous_df.hour) # Check this histogram to eyeball if any unreasonable anomalous values are caught by the tolerance (e.g. late at night)
(array([ 3.,  9., 24., 22., 34., 22., 30., 12.,  9.,  3.]),
 array([ 6.  ,  7.25,  8.5 ,  9.75, 11.  , 12.25, 13.5 , 14.75, 16.  ,
        17.25, 18.5 ]),
 <BarContainer object of 10 artists>)

png

Replace these values in df_power_features.

df_power_features_clean = df_power_features.copy()
df_power_features_clean.loc[df_power_features_clean.index.isin(anomalous_dates), 'pv_power_mw'] = np.nan

Rerun the previous model fitting and check the pred vs. actual graph.

models = {
    'std_linear': LinearRegression(),
    'random_forest': RandomForestRegressor(),
    'boosted': GradientBoostingRegressor()
}

rerun_site_power_model = False
model_scores_filename = 'site_power_interp_clean_model_results.csv'

X, y, dates = split_X_y_data_with_index(df_power_features_clean, 'pv_power_mw')

if (rerun_site_power_model == True) or (model_scores_filename not in os.listdir(cache_data_dir)):
    df_model_scores = evaluate_models(X, y, models)
    df_model_scores.to_csv(f'{cache_data_dir}/{model_scores_filename}')
else:
    df_model_scores = pd.read_csv(f'{cache_data_dir}/{model_scores_filename}', index_col='metric')

top_model = df_model_scores.T['rmse'].idxmin()
df_pred = generate_kfold_preds(X, y, models[top_model])

plt.scatter(df_pred['true'], df_pred['pred'], s=1)

plt.xlabel('Observation')
plt.ylabel('Prediction')
Text(0, 0.5, 'Prediction')

png

The above graph looks cleaner with the tolerance at 0.1, although it looks like there may still be some anomalous points remaining. Consider lowering the tolerance.
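
As a sketch of what that might look like, the anomaly pass could be repeated with a stricter cut-off (the 0.05 value here is purely illustrative, not a tuned choice):

# Illustrative only: a lower tolerance flags more points as anomalous
anomalous_dates = dates[identify_anomalies_pv(df_pred, tolerance=0.05)]

plt.hist(df_power_features_clean.loc[df_power_features_clean.index.isin(anomalous_dates), 'hour']) # eyeball again for unreasonable catches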

#exports
def pv_anomalies_to_nan(df_pv, model=GradientBoostingRegressor(), tolerance=0.1):
    """
    Run this function to identify places where predicted values for pv_power_mw are much larger
    than true values and where the true value is 0 (we expect these are anomalies) and make these values nan.

    """
    df_power_features = construct_df_power_features(df_pv)

    X, y, dates = split_X_y_data_with_index(df_power_features, 'pv_power_mw')
    df_pred = generate_kfold_preds(X, y, model)
    df_pred['difference'] = df_pred.pred - df_pred.true
    df_pred['datetime'] = dates
    df_pred = df_pred.set_index('datetime')

    anomalous_idx = df_pred[(df_pred.difference > tolerance) & (df_pred.true == 0)].index    

    df_pv.loc[df_pv.index.isin(anomalous_idx), 'pv_power_mw'] = np.nan

    return df_pv
df_pv = pv_anomalies_to_nan(df_pv)
#exports
def interpolate_missing_site_power(df_pv, model=RandomForestRegressor()):
    missing_site_power_dts = df_pv.index[df_pv['pv_power_mw'].isnull()]

    if len(missing_site_power_dts) == 0: # i.e. no missing values
        return df_pv

    df_power_features = construct_df_power_features(df_pv)
    missing_dt_X = df_power_features.loc[missing_site_power_dts].drop('pv_power_mw', axis=1).values
    X, y = split_X_y_data(df_power_features, 'pv_power_mw')

    model.fit(X, y)
    df_pv.loc[missing_site_power_dts, 'pv_power_mw'] = model.predict(missing_dt_X)

    assert df_pv['pv_power_mw'].isnull().sum() == 0, 'There are still null values for the solar site power'

    return df_pv
df_pv = interpolate_missing_site_power(df_pv)

df_pv.isnull().mean()
irradiance_Wm-2    0.0
panel_temp_C       0.0
pv_power_mw        0.0
dtype: float64
#exports
def interpolate_missing_weather_solar(df_pv, df_weather, weather_col='solar_location2', model=RandomForestRegressor()):
    missing_weather_solar_dts = df_weather.index[df_weather[weather_col].isnull()]

    if len(missing_weather_solar_dts) == 0: # i.e. no missing values
        return df_weather

    df_irradiance_features = construct_df_irradiance_features(df_weather, df_pv).drop('site_solar', axis=1)
    missing_dt_X = df_irradiance_features.loc[missing_weather_solar_dts].drop(weather_col, axis=1).values
    X, y = split_X_y_data(df_irradiance_features, weather_col)

    model.fit(X, y)
    df_weather.loc[missing_weather_solar_dts, weather_col] = model.predict(missing_dt_X)

    assert df_weather[weather_col].isnull().sum() == 0, 'There are still null values for the weather dataset solar observations'

    return df_weather
df_weather = interpolate_missing_weather_solar(df_pv, df_weather, model=LinearRegression())

df_weather.isnull().mean()
solar_location1    0.0000
solar_location2    0.0000
solar_location3    0.0000
solar_location4    0.0000
solar_location5    0.0000
solar_location6    0.0000
temp_location1     0.0000
temp_location2     0.0000
temp_location3     0.0000
temp_location4     0.0011
temp_location5     0.0000
temp_location6     0.0000
dtype: float64


Interpolate Missing Temp Data

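The interpolate_missing_temps helper isn't defined anywhere above. A minimal sketch that's consistent with its usage below and mirrors interpolate_missing_weather_solar might look like the following (the implementation in the batopt module itself may differ):

#exports
def interpolate_missing_temps(df_weather, weather_col, model=RandomForestRegressor()):
    missing_temp_dts = df_weather.index[df_weather[weather_col].isnull()]

    if len(missing_temp_dts) == 0: # i.e. no missing values
        return df_weather

    # Using the temperatures at the other weather grid locations as features
    # (this assumes the other locations are not missing at the same timestamps)
    temp_loc_cols = df_weather.columns[df_weather.columns.str.contains('temp')]
    df_temp_features = df_weather[temp_loc_cols]

    missing_dt_X = df_temp_features.loc[missing_temp_dts].drop(weather_col, axis=1).values
    X, y = split_X_y_data(df_temp_features, weather_col)

    model.fit(X, y)
    df_weather.loc[missing_temp_dts, weather_col] = model.predict(missing_dt_X)

    assert df_weather[weather_col].isnull().sum() == 0, 'There are still null values for the weather dataset temperature observations'

    return df_weather
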
foo = interpolate_missing_temps(df_weather, 'temp_location4', model=LinearRegression())
foo.isnull().mean()
solar_location1    0.0
solar_location2    0.0
solar_location3    0.0
solar_location4    0.0
solar_location5    0.0
solar_location6    0.0
temp_location1     0.0
temp_location2     0.0
temp_location3     0.0
temp_location4     0.0
temp_location5     0.0
temp_location6     0.0
dtype: float64


Finally, we'll export the relevant code to our batopt module
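
Assuming the project follows the nbdev (v1) workflow that the #exports flags suggest, this is typically a single call, e.g.:

# Sketch: convert the flagged cells in the notebooks into module files
from nbdev.export import notebook2script

notebook2script()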