Commit fce6591d authored by Luis Salamanca's avatar Luis Salamanca
Browse files

Template and notebooks

parent 506ec486
Pipeline #76 passed with stage
in 4 minutes and 2 seconds
%% Cell type:code id: tags:
``` python
import base64
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stat
sns.set()
```
%% Cell type:markdown id: tags:
# Preparation
This code is derived from the [Crowd AI challenge starter kit](https://github.com/crowdAI/ieee_investment_ranking_challenge-starter-kit). This notebook demonstrates reading in the data and writing out some features to process.
## Read Challenge Data and the Prediction Template
%% Cell type:code id: tags:parameters
``` python
# Notebook parameters (this cell carries the `parameters` tag).
# Input: CSV with the full challenge dataset, read below with pandas.
dataset_file_path = "../data/invest/full_dataset.csv"
# Output: where the engineered feature frame is pickled at the end of this notebook.
features_pickle_file_path = '../data/outputs/features.pkl'
```
%% Cell type:code id: tags:
``` python
# Load the whole challenge dataset into one DataFrame; every later cell works off `df`.
df = pd.read_csv(dataset_file_path)
```
%% Cell type:markdown id: tags:
## Inspect the data / EDA
%% Cell type:code id: tags:
``` python
# Collect the distinct time-period labels (in order of first appearance)
# and show how many there are.
time_periods = df['time_period'].unique()
len(time_periods)
```
%% Cell type:code id: tags:
``` python
# Plot the distribution of forward returns for a (non-random) sample of
# time periods, one panel per period on a 2x3 grid with shared axes.
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases;
# kept here so the output does not change.
fig, axs = plt.subplots(2, 3, sharex=True, sharey=True)
fig.suptitle("Distribution of 6M Forward Returns")
ts = time_periods[::7]  # every 7th period; intended to give 6 periods for the 6 panels
sample_df = df[df['time_period'].isin(ts)]
for panel, (label, frame) in enumerate(sample_df.groupby('time_period')):
    grid_row, grid_col = divmod(panel, 3)
    ax = axs[grid_row][grid_col]
    period_returns = frame['Norm_Ret_F6M'].dropna()
    sns.distplot(period_returns, ax=ax, axlabel="")
    ax.set_title("period|{}".format(label))
```
%% Cell type:markdown id: tags:
## Recompute the `Rank_F6M`
%% Cell type:code id: tags:
``` python
# Re-rank the training rows of every period by their forward return.
# Rank 1 goes to the highest return; ties are broken by row order ('ordinal').
is_train = df['Train'] == 1
for period in time_periods:
    rows = (df['time_period'] == period) & is_train
    period_returns = df.loc[rows, 'Norm_Ret_F6M']
    ascending = stat.rankdata(period_returns, method='ordinal').astype(int)
    # flip ascending ranks so the largest return ends up with rank 1
    df.loc[rows, 'Rank_F6M'] = len(period_returns) - ascending + 1
```
%% Cell type:markdown id: tags:
## Example Feature Engineering
Each of the 71 variables is broken up into **6 non-overlapping observations** in each time period (the code below processes `X1` through `X70`). For example, `X1` has six monthly observations in each period, represented as `X1_1`, `X1_2`, ..., `X1_6`.
To make it easier to model, we will average the 6 observations within each `time_period`.
%% Cell type:code id: tags:
``` python
# Build the modelling frame: the identifier/target columns plus, for each
# variable prefix, the row-wise mean of its observation columns and the
# rank of that mean divided by the number of rows.
model_columns = ['time_period', 'index', 'Train', 'Norm_Ret_F6M', 'Rank_F6M']
model_df = df[model_columns].copy()
variable_list = ['X{}_'.format(number) for number in range(1, 71)]  # 'X1_' ... 'X70_'
for prefix in variable_list:
    avg_name = prefix + 'avg'
    # average every column whose name matches this prefix
    row_mean = df.filter(regex=prefix).mean(axis=1)
    model_df[avg_name] = row_mean
    # rank is taken over the whole frame, not per time period
    model_df[avg_name + '_pctile'] = stat.rankdata(row_mean) / len(row_mean)
model_df.head()
```
%% Cell type:markdown id: tags:
# Write out features
%% Cell type:code id: tags:
``` python
# Persist the engineered feature frame as a pickle at the configured output path.
model_df.to_pickle(features_pickle_file_path)
```
%% Cell type:code id: tags:
``` python
import base64
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stat
from sklearn.ensemble import RandomForestRegressor
sns.set()
pd.options.mode.chained_assignment = None
```
%% Cell type:markdown id: tags:
# Model / prediction
This code is derived from the [Crowd AI challenge starter kit](https://github.com/crowdAI/ieee_investment_ranking_challenge-starter-kit). This notebook demonstrates reading in the engineered features, fitting a random forest model with an expanding training window, and writing the rank predictions into the prediction template.
## Read in the features, build a model and predict the results
%% Cell type:code id: tags:parameters
``` python
# Notebook parameters (this cell carries the `parameters` tag).
# Input: pickled feature frame, read below with pandas.
features_pickle_file_path = "../data/outputs/features.pkl"
# Input: CSV template whose `Rank_F6M` column is filled with predictions.
pred_template_file_path = "../templates/prediction_template.csv"
# Output: where the filled-in template is written as CSV.
pred_output_file_path = "../data/outputs/predictions.csv"
```
%% Cell type:code id: tags:
``` python
# Load the pickled feature frame (presumably the one written by the feature notebook — same path).
# NOTE(review): unpickling can execute arbitrary code; only load pickles produced by a trusted run.
model_df = pd.read_pickle(features_pickle_file_path)
# Prediction template; its `Rank_F6M` column is filled in per time period further down.
pred_template_df = pd.read_csv(pred_template_file_path)
```
%% Cell type:markdown id: tags:
## Create function for imputation and Random Forest
%% Cell type:code id: tags:
``` python
# Sorted distinct period labels, plus the row position at which each label
# first occurs in `model_df`.
time_periods, time_periods_index = np.unique(model_df['time_period'], return_index=True)


def randomForest(train_start_period, prediction_period):
    """Fit a random forest on the rows between two time periods.

    The training window runs from the first row of ``train_start_period`` up
    to, but not including, the first row of ``prediction_period``. Missing
    values inside the window are replaced with 0, and only rows with
    ``Train == 1`` are used for fitting.

    NOTE(review): the window is positional, so this assumes `model_df` rows
    are ordered by time period — confirm against the input data.

    Args:
        train_start_period: label of the first period in the training window.
        prediction_period: label of the period that ends the window (excluded).

    Returns:
        The fitted ``RandomForestRegressor``.

    Raises:
        IndexError: if either label does not occur in ``time_periods``.
    """
    first_row = time_periods_index[time_periods == train_start_period][0]
    stop_row = time_periods_index[time_periods == prediction_period][0]
    window = model_df.iloc[first_row:stop_row, :].fillna(0)

    labelled = window['Train'] == 1
    features = window.loc[labelled, 'X1_avg':'X70_avg_pctile']
    target = window.loc[labelled, 'Norm_Ret_F6M']

    forest = RandomForestRegressor(n_estimators=1, verbose=2, oob_score=True, max_features=10)
    forest.fit(features, target)
    return forest
```
%% Cell type:markdown id: tags:
## Create function for calculating Spearman correlation and Normalized Discounted Cumulative Gain
%% Cell type:code id: tags:
``` python
def _top_fifth_gain(actuals, rank_column):
    """Sum the discounted returns of the rows in the top 20% by ``rank_column``.

    A row is selected when its rank is at or below the 20th percentile of the
    ranks in that column. Each row's return is weighted by
    ``max_rank / (max_rank + rank)`` before summing.
    """
    ranks = actuals[rank_column]
    max_rank = np.amax(ranks)
    discount = max_rank / (max_rank + ranks)
    selected = ranks <= np.nanpercentile(ranks, 20)
    gain = actuals.loc[selected, 'Norm_Ret_F6M'] * discount[selected]
    return np.sum(gain)


def calc_metrics(time_period, predicted_rank):
    """Score predicted ranks for one period against the actual ranks.

    Args:
        time_period: label of the period to evaluate.
        predicted_rank: predicted ranks for that period's ``Train == 1`` rows
            of `model_df`, in the same row order.

    Returns:
        A one-row DataFrame with columns ``time_period``, ``spearman`` and
        ``NDCG``, where NDCG is the discounted gain of the top 20% by
        predicted rank divided by that of the top 20% by actual rank.
    """
    labelled = (model_df['time_period'] == time_period) & (model_df['Train'] == 1)
    # work on a copy so the prediction column never leaks into `model_df`
    actuals = model_df.loc[labelled, :].copy()
    actuals['Rank_F6M_pred'] = predicted_rank

    spearman = stat.spearmanr(actuals['Rank_F6M'], actuals['Rank_F6M_pred'])[0]

    DCG = _top_fifth_gain(actuals, 'Rank_F6M_pred')
    IDCG = _top_fifth_gain(actuals, 'Rank_F6M')
    NDCG = DCG / IDCG

    row = (time_period, spearman, NDCG)
    return pd.DataFrame([row], columns=['time_period', 'spearman', 'NDCG'])
```
%% Cell type:markdown id: tags:
## Random forest model on warm-up period
%% Cell type:code id: tags:
``` python
# Period to predict in the warm-up run; the next cell reads `time` and `rf`.
time = '2002_1'
# Fit on the rows from the first '1996_2' row up to (not including) the first row of `time`.
rf = randomForest(train_start_period = '1996_2', prediction_period = time)
```
%% Cell type:code id: tags:
``` python
# Replace missing values across the whole frame before predicting.
model_df.fillna(0, inplace=True)
# Predict for every row of the warm-up period, labelled or not.
period_features = model_df.loc[model_df['time_period'] == time, 'X1_avg':'X70_avg_pctile']
predictions = rf.predict(period_features)
# view prediction histogram
pred_hist = sns.distplot(predictions)
plt.show()
# print Out-of-bag R^2
print(rf.oob_score_)
```
%% Cell type:markdown id: tags:
## Expanding window procedure
Expanding window = Train model up to time `t` to predict on all observations at time `t+1`
* Show results on training data
* Add test predictions to prediction template
%% Cell type:code id: tags:
``` python
# Expanding-window evaluation. For each period from position 11 of
# `time_periods` onwards: fit a forest on the earlier periods, score it on
# that period's labelled rows (Train == 1), and write rank predictions for
# its unlabelled rows (Train == 0) into the prediction template.
model_df.fillna(0, inplace=True)
metric_frames = []  # one single-row metrics frame per scored period
for time in time_periods[11:]:
    rf = randomForest(train_start_period='1996_2', prediction_period=time)
    in_period = model_df['time_period'] == time
    # '2017_1' is not scored — presumably it has no labelled rows; confirm against the data.
    if time != '2017_1':
        train_rows = in_period & (model_df['Train'] == 1)
        train_predictions = rf.predict(model_df.loc[train_rows, 'X1_avg':'X70_avg_pctile'])
        # flip ascending ordinal ranks so rank 1 is the highest predicted return
        train_rank_predictions = len(train_predictions) - stat.rankdata(train_predictions, method='ordinal').astype(int) + 1
        metric_frames.append(calc_metrics(time_period=time, predicted_rank=train_rank_predictions))
    test_rows = in_period & (model_df['Train'] == 0)
    test_predictions = rf.predict(model_df.loc[test_rows, 'X1_avg':'X70_avg_pctile'])
    test_rank_predictions = len(test_predictions) - stat.rankdata(test_predictions, method='ordinal').astype(int) + 1
    # positional assignment: relies on the template rows for this period matching
    # the unlabelled rows of `model_df` in count and order
    pred_template_df.loc[pred_template_df['time_period'] == time, 'Rank_F6M'] = test_rank_predictions
    print("Time period " + time + " completed.")
# `DataFrame.append` was removed in pandas 2.0 and copied the frame on every
# iteration; concatenate the collected rows once instead.
if metric_frames:
    train_results = pd.concat(metric_frames)
else:
    train_results = pd.DataFrame(columns=['time_period', 'spearman', 'NDCG'])
print(train_results)
```
%% Cell type:code id: tags:
``` python
# Write the filled-in template to disk. With the pandas default, the DataFrame
# index is written as an extra leading column.
# NOTE(review): confirm the submission format expects that column.
pred_template_df.to_csv(pred_output_file_path)
```
%% Cell type:code id: tags:
``` python
```
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment