Data Retrieval¶
#exports
import json
import numpy as np
import pandas as pd
import requests
import FEAutils as hlp
import matplotlib.pyplot as plt
from IPython.display import JSON
User Inputs¶
# Directory the holidays JSON is read from by the loading steps in this notebook
raw_data_dir = '../data/raw'
# Directory the processed half-hourly holidays CSV is written to
intermediate_data_dir = '../data/intermediate'
Public Holidays¶
We'll start by retrieving a JSON for public holidays available from www.gov.uk.
def get_holidays_json(holidays_url='https://www.gov.uk/bank-holidays.json'):
    """Request `holidays_url` and return the response body decoded from JSON."""
    response = requests.get(holidays_url)
    return response.json()


# Fetch the feed once and render it with IPython's interactive JSON viewer
holidays_json = get_holidays_json()
JSON(holidays_json)
<IPython.core.display.JSON object>
We'll quickly save this data
#exports
def save_latest_raw_holiday_data(raw_data_dir, holidays_url='https://www.gov.uk/bank-holidays.json'):
    """Download the bank-holidays JSON and write it to `{raw_data_dir}/holidays.json`.

    Parameters
    ----------
    raw_data_dir : str
        Directory the `holidays.json` file is written into; any existing
        file of that name is overwritten.
    holidays_url : str
        Address the JSON is requested from.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    # Fetch inline rather than through `get_holidays_json`: that helper is
    # defined outside the `#exports` cells, so relying on it would leave this
    # exported function with an undefined name.
    response = requests.get(holidays_url)
    # Fail loudly instead of saving an error page as if it were holiday data
    response.raise_for_status()
    holidays_json = response.json()

    with open(f'{raw_data_dir}/holidays.json', 'w') as fp:
        json.dump(holidays_json, fp)
# Save into the raw directory: that is where `load_holidays_df(raw_data_dir)`
# reads `holidays.json` back from
save_latest_raw_holiday_data(raw_data_dir)
We'll then convert it into a dataframe
#exports
def load_holidays_df(raw_data_dir, division='england-and-wales'):
    """Load `{raw_data_dir}/holidays.json` and return one division's events as a dataframe.

    Parameters
    ----------
    raw_data_dir : str
        Directory containing the saved `holidays.json` file.
    division : str
        Top-level key of the JSON whose `'events'` list is loaded. Defaults
        to `'england-and-wales'`, the value that was previously hard-coded.
        NOTE(review): other keys (e.g. `'scotland'`) are expected in the
        gov.uk feed -- confirm against the downloaded file.

    Returns
    -------
    pandas.DataFrame
        One row per event, with the `'date'` column parsed to datetimes.

    Raises
    ------
    KeyError
        If `division` is not a key of the loaded JSON.
    """
    with open(f'{raw_data_dir}/holidays.json', 'r') as fp:
        holidays_json = json.load(fp)

    df_holidays = pd.DataFrame(holidays_json[division]['events'])
    df_holidays['date'] = pd.to_datetime(df_holidays['date'])

    return df_holidays
# Load the saved holidays JSON from the raw directory and preview the first rows
df_holidays = load_holidays_df(raw_data_dir)
df_holidays.head()
title | date | notes | bunting | |
---|---|---|---|---|
0 | New Year’s Day | 2016-01-01 | nan | True |
1 | Good Friday | 2016-03-25 | nan | False |
2 | Easter Monday | 2016-03-28 | nan | True |
3 | Early May bank holiday | 2016-05-02 | nan | True |
4 | Spring bank holiday | 2016-05-30 | nan | True |
We'll now create a half-hourly time-series in which every period falling on a public holiday is given a value of 1 (and 0 otherwise)
#exports
def holidays_df_to_s(df_holidays):
    """Convert a holidays dataframe into a half-hourly 0/1 holiday indicator series.

    Parameters
    ----------
    df_holidays : pandas.DataFrame
        Must contain a datetime `'date'` column with one entry per holiday.

    Returns
    -------
    pandas.Series
        Integer series named `'holiday'` on a UTC half-hourly index named
        `'datetime'`. It runs from 00:00 on the earliest holiday to 23:30 on
        the latest, and is 1 wherever the timestamp's date is a holiday.
    """
    first_period = df_holidays['date'].min().normalize()
    # End at 23:30 of the final holiday so that day is flagged for all 48
    # half-hours, rather than only its midnight timestamp
    last_period = (
        df_holidays['date'].max().normalize()
        + pd.Timedelta(days=1)
        - pd.Timedelta(minutes=30)
    )

    # '30min' replaces the '30T' alias, which recent pandas versions deprecate
    holidays_dt_range = pd.date_range(first_period, last_period, freq='30min', tz='UTC')
    is_holiday = np.isin(holidays_dt_range.date, df_holidays['date'].dt.date)

    s_holidays = pd.Series(is_holiday, index=holidays_dt_range).astype(int)
    s_holidays.index.name = 'datetime'
    s_holidays.name = 'holiday'

    return s_holidays
# Build the half-hourly holiday indicator from the dataframe and preview it
s_holidays = holidays_df_to_s(df_holidays)
s_holidays.head()
datetime
2016-01-01 00:00:00+00:00 1
2016-01-01 00:30:00+00:00 1
2016-01-01 01:00:00+00:00 1
2016-01-01 01:30:00+00:00 1
2016-01-01 02:00:00+00:00 1
Freq: 30T, Name: holiday, dtype: int32
We'll quickly plot the results
# Draw the 2016 portion of the holiday indicator on a single axis
fig, ax = plt.subplots(dpi=150)

holidays_2016 = s_holidays.loc['2016']
holidays_2016.plot(ax=ax)

# Strip the frame and y ticks so only the indicator itself is shown
spine_positions = ['top', 'bottom', 'left', 'right']
hlp.hide_spines(ax, positions=spine_positions)
ax.set_yticks([])
ax.set_ylim(0.1, 0.9)
(0.1, 0.9)
We'll create a wrapper for combining these steps
#exports
def load_holidays_s(raw_data_dir):
    """Load the holidays file from `raw_data_dir` and return the half-hourly holiday series.

    Chains `load_holidays_df` and `holidays_df_to_s`, returning the series
    produced by the latter.
    """
    return holidays_df_to_s(load_holidays_df(raw_data_dir))
# Rebuild the series through the combined wrapper and preview it
s_holidays = load_holidays_s(raw_data_dir)
s_holidays.head()
datetime
2016-01-01 00:00:00+00:00 1
2016-01-01 00:30:00+00:00 1
2016-01-01 01:00:00+00:00 1
2016-01-01 01:30:00+00:00 1
2016-01-01 02:00:00+00:00 1
Freq: 30T, Name: holiday, dtype: int32
And also save the data to a csv
# Write the half-hourly holiday series to holidays.csv in the intermediate directory
s_holidays.to_csv(f'{intermediate_data_dir}/holidays.csv')
Finally we'll export the relevant code to our `batopt` module