Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
311 changes: 311 additions & 0 deletions Homework2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,311 @@
import pandas as pd
import numpy as np
from pandas.tseries.offsets import MonthEnd, QuarterEnd
import scipy.stats as stats
import locale

# Attempt to use English locale if available
try:
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
except locale.Error:
pass

# ----------------------------
# Risk free rate
# ----------------------------
# Paths here follow the original R script. Adjust to your local layout.
riskfree = pd.read_excel('Input/Market_Return/TRD_Nrrate2023.xlsx', sheet_name=0, skiprows=2)

riskfree_day = riskfree[['Clsdt', 'Nrrdaydt']].rename(columns={'Clsdt': 'Day', 'Nrrdaydt': 'rfday'})
riskfree_day['rfday'] = pd.to_numeric(riskfree_day['rfday'], errors='coerce') / 100
riskfree_day['Day'] = pd.to_datetime(riskfree_day['Day'])

riskfree_mon = riskfree[['Clsdt', 'Nrrmtdt']].rename(columns={'Clsdt': 'Day', 'Nrrmtdt': 'rf'})
riskfree_mon['month'] = pd.to_datetime(riskfree_mon['Day']).dt.to_period('M').dt.to_timestamp('M')
riskfree_mon = (
riskfree_mon.assign(rf=pd.to_numeric(riskfree_mon['rf'], errors='coerce') / 100)
.groupby('month')['rf'].mean()
.reset_index()
.rename(columns={'rf': 'rfmonth'})
)

# ----------------------------
# Individual stock monthly return
# ----------------------------
ret_mon = (
pd.read_excel('Input/Individual Return/TRD_Mnth199012-202312.xlsx', skiprows=2)
[[
'Stkcd',
'Trdmnt',
'Ndaytrd',
'Msmvosd',
'Msmvttl',
'Mretwd',
'Markettype',
]]
)
ret_mon = ret_mon.rename(
columns={
'Trdmnt': 'month',
'Ndaytrd': 'Freq',
'Msmvosd': 'floatingvalue',
'Msmvttl': 'totalvalue',
'Mretwd': 'Return',
}
)
ret_mon['floatingvalue'] = pd.to_numeric(ret_mon['floatingvalue'], errors='coerce') * 1000
ret_mon['totalvalue'] = pd.to_numeric(ret_mon['totalvalue'], errors='coerce') * 1000
ret_mon['Return'] = pd.to_numeric(ret_mon['Return'], errors='coerce')
ret_mon['month'] = pd.to_datetime(ret_mon['month']).dt.to_period('M').dt.to_timestamp('M')
ret_mon['Freq'] = pd.to_numeric(ret_mon['Freq'], errors='coerce')
ret_mon['Markettype'] = pd.to_numeric(ret_mon['Markettype'], errors='coerce')
ret_mon = ret_mon[ret_mon['Markettype'].isin([1, 4, 16])]
ret_mon = ret_mon.merge(riskfree_mon, on='month', how='left')
ret_mon = ret_mon.sort_values(['Stkcd', 'month']).reset_index(drop=True)

all_months = ret_mon['month'].drop_duplicates().sort_values()


def expand_stock(df: pd.DataFrame) -> pd.DataFrame:
start = df['month'].min()
months = all_months[all_months >= start]
new_df = pd.DataFrame({'month': months})
new_df['Stkcd'] = df['Stkcd'].iloc[0]
return new_df.merge(df, on=['Stkcd', 'month'], how='left')


ret_mon = (
ret_mon.groupby('Stkcd', group_keys=False).apply(expand_stock).reset_index(drop=True)
)
ret_mon['ipomonth'] = ret_mon.groupby('Stkcd')['month'].transform('min')
ret_mon = ret_mon[ret_mon['month'] >= ret_mon['ipomonth']]
ret_mon = ret_mon.sort_values(['Stkcd', 'month'])

ret_mon['ret'] = ret_mon['Return'] - ret_mon['rfmonth']
for i in range(1, 13):
ret_mon[f'next_ret{i}'] = ret_mon.groupby('Stkcd')['ret'].shift(-i)
ret_mon['Rank'] = ret_mon.groupby('Stkcd').cumcount() + 1
ret_mon['sizef'] = np.log(ret_mon['floatingvalue'])
ret_mon['sizet'] = np.log(ret_mon['totalvalue'])

ret_mon = ret_mon[
[
'Stkcd',
'month',
'Rank',
'Freq',
'floatingvalue',
'totalvalue',
'sizef',
'sizet',
'Return',
'rfmonth',
'ret',
]
+ [f'next_ret{i}' for i in range(1, 13)]
]

# ----------------------------
# Earnings data
# ----------------------------
account_date = (
pd.read_excel('Input/Report_Date/IAR_Rept2023.xlsx', skiprows=2)
[[
'Stkcd',
'Accper',
'Annodt',
]]
.rename(columns={'Annodt': 'report_date'})
)
account_date['Accper'] = pd.to_datetime(account_date['Accper'])
account_date['report_date'] = pd.to_datetime(account_date['report_date'])
account_date['Q'] = account_date['Accper'].dt.to_period('Q').dt.to_timestamp('Q')

profit = (
pd.read_excel('Input/Profit/FS_Comins2023.xlsx', skiprows=2)
[[
'Stkcd',
'Accper',
'Typrep',
'B002000000',
]]
.dropna()
)
profit = profit.query("Typrep == 'A'").drop('Typrep', axis=1)
profit['Accper'] = pd.to_datetime(profit['Accper'])

nonrec = (
pd.read_excel('Input/Non_recurring_gains_and_losses/FI_T22023.xlsx', skiprows=2)
[[
'Stkcd',
'Accper',
'F020101',
]]
)
nonrec['Accper'] = pd.to_datetime(nonrec['Accper'])
profit = profit.merge(nonrec, on=['Stkcd', 'Accper'], how='left')
profit = profit.rename(columns={'Accper': 'Q'})
profit = profit[profit['Q'].dt.month > 1]
profit['Q'] = profit['Q'].dt.to_period('Q').dt.to_timestamp('Q')
profit = profit.sort_values(['Stkcd', 'Q'])

profit['earnings'] = np.where(
profit['F020101'].isna(),
pd.to_numeric(profit['B002000000'], errors='coerce'),
pd.to_numeric(profit['B002000000'], errors='coerce') - pd.to_numeric(profit['F020101'], errors='coerce'),
)
profit = profit.merge(account_date[['Stkcd', 'Q', 'report_date']], on=['Stkcd', 'Q'], how='left')
profit['report_date'] = profit['report_date'].dt.to_period('M').dt.to_timestamp('M')

profit['Q_earnings'] = np.nan
for code, group in profit.groupby('Stkcd'):
group = group.sort_values('Q')
earnings = []
for idx, row in group.iterrows():
if row['Q'].year < 2002:
earnings.append(np.nan)
elif row['Q'].quarter == 1:
earnings.append(row['earnings'])
else:
prev = group[group['Q'] == row['Q'] - QuarterEnd(startingMonth=12)]
if not prev.empty:
earnings.append(row['earnings'] - prev['earnings'].iloc[0])
else:
earnings.append(np.nan)
profit.loc[group.index, 'Q_earnings'] = earnings


valid_ret = ret_mon[(ret_mon['month'] >= '1991-01-01') & (~ret_mon['Return'].isna())]


def match_profit(key):
stk, m = key
test = profit[(profit['report_date'] <= m) & (profit['Stkcd'] == stk)]
if test.empty:
return pd.Series({'matchearnings': np.nan, 'recent_earnings': np.nan})
t0 = test['Q'].max()
t1 = t0 - QuarterEnd(0)
t2 = t0 - 2 * QuarterEnd(0)
t3 = t0 - 3 * QuarterEnd(0)
recent = test.loc[test['Q'] == t0, 'earnings'].values[0]
if m >= pd.Timestamp('2003-05-01'):
needed = test[test['Q'].isin([t1, t2, t3, t0])]
if len(needed) == 4 and needed['Q_earnings'].notna().all():
matche = (
needed.loc[needed['Q'] == t1, 'Q_earnings'].values[0]
+ needed.loc[needed['Q'] == t2, 'Q_earnings'].values[0]
+ needed.loc[needed['Q'] == t3, 'Q_earnings'].values[0]
+ needed.loc[needed['Q'] == t0, 'Q_earnings'].values[0]
)
else:
matche = np.nan
else:
q = t0.quarter
if q == 4:
matche = test.loc[test['Q'] == t0, 'earnings'].values[0]
elif q == 3 and not test[test['Q'] == t3].empty:
matche = test.loc[test['Q'] == t3, 'earnings'].values[0]
elif q == 2 and not test[test['Q'] == t2].empty:
matche = test.loc[test['Q'] == t2, 'earnings'].values[0]
elif q == 1 and not test[test['Q'] == t1].empty:
matche = test.loc[test['Q'] == t1, 'earnings'].values[0]
else:
matche = np.nan
return pd.Series({'matchearnings': matche, 'recent_earnings': recent})


ep_df = (
valid_ret.set_index(['Stkcd', 'month'])
.groupby(level=[0, 1])
.apply(lambda _: match_profit(_.name))
.reset_index()
)
ep_df = ep_df.merge(ret_mon[['Stkcd', 'month', 'floatingvalue', 'totalvalue']], on=['Stkcd', 'month'], how='right')
# Placeholder: load daily return data with columns Stkcd, month, ratio, Clsprc, yuemo
ret_day = pd.DataFrame()
ep_df = ep_df.merge(
ret_day[getattr(ret_day, 'yuemo', pd.Series()) == 1][['Stkcd', 'month', 'ratio', 'Clsprc']],
on=['Stkcd', 'month'],
how='left',
)
ep_df['ep'] = ep_df['matchearnings'] * pd.to_numeric(ep_df.get('ratio'), errors='coerce') / ep_df['floatingvalue']
ep_df['ep_recent'] = ep_df['recent_earnings'] / ep_df['totalvalue']

# ----------------------------
# Factor construction
# ----------------------------
ret_mon = ret_mon.merge(
ret_mon.sort_values(['Stkcd', 'month'])
.groupby('Stkcd')['Freq']
.apply(lambda s: s.rolling(window=12, min_periods=1).sum())
.reset_index(name='past12freq'),
on=['Stkcd', 'month'],
how='left',
)
ret_mon['sizebreak'] = ret_mon.groupby('month')['totalvalue'].transform(lambda x: x.quantile(0.3))

Data = ret_mon[
(ret_mon['Rank'] > 6)
& (ret_mon['past12freq'] >= 120)
& (ret_mon['Freq'] >= 10)
& (ret_mon['totalvalue'] >= ret_mon['sizebreak'])
]
Data = Data.merge(ep_df[['Stkcd', 'month', 'ep', 'ep_recent']], on=['Stkcd', 'month'])

MKT = (
Data[Data['next_ret1'].notna() & Data['totalvalue'].notna()]
.groupby('month')
.apply(lambda x: np.average(x['next_ret1'], weights=x['totalvalue']))
.reset_index(name='mkt')
)
MKT['month'] = MKT['month'] + MonthEnd(1)

Data['size.notion'] = Data.groupby('month')['sizet'].transform(
lambda x: pd.cut(x, bins=np.quantile(x.dropna(), [0, 0.5, 1]), labels=['S', 'B'], include_lowest=True)
)
Data['ep.notion'] = Data.groupby('month')['ep'].transform(
lambda x: pd.cut(x, bins=np.quantile(x.dropna(), [0, 0.3, 0.7, 1]), labels=['G', 'M', 'V'], include_lowest=True)
)

def weighted_mean(df: pd.DataFrame) -> float:
return np.average(df['next_ret1'], weights=df['totalvalue'])


Factor_sv = Data[(Data['size.notion'] == 'S') & (Data['ep.notion'] == 'V')].groupby('month').apply(weighted_mean).reset_index(name='SV')
Factor_sm = Data[(Data['size.notion'] == 'S') & (Data['ep.notion'] == 'M')].groupby('month').apply(weighted_mean).reset_index(name='SM')
Factor_sg = Data[(Data['size.notion'] == 'S') & (Data['ep.notion'] == 'G')].groupby('month').apply(weighted_mean).reset_index(name='SG')
Factor_bv = Data[(Data['size.notion'] == 'B') & (Data['ep.notion'] == 'V')].groupby('month').apply(weighted_mean).reset_index(name='BV')
Factor_bm = Data[(Data['size.notion'] == 'B') & (Data['ep.notion'] == 'M')].groupby('month').apply(weighted_mean).reset_index(name='BM')
Factor_bg = Data[(Data['size.notion'] == 'B') & (Data['ep.notion'] == 'G')].groupby('month').apply(weighted_mean).reset_index(name='BG')

Factor = (
Factor_sv.merge(Factor_sm, on='month')
.merge(Factor_sg, on='month')
.merge(Factor_bv, on='month')
.merge(Factor_bm, on='month')
.merge(Factor_bg, on='month')
)
Factor['smb'] = (Factor['SV'] + Factor['SM'] + Factor['SG']) / 3 - (
Factor['BV'] + Factor['BM'] + Factor['BG']
) / 3
Factor['vmg'] = (Factor['BV'] + Factor['SV']) / 2 - (
Factor['SG'] + Factor['BG']
) / 2
Factor = Factor[['month', 'smb', 'vmg']]
Factor['month'] = Factor['month'] + MonthEnd(1)
Factor = Factor.merge(MKT, on='month', how='left')

pre_2017 = Factor[Factor['month'] <= '2016-12-31']
print(len(pre_2017))
print(
pre_2017['smb'].mean() * 100,
pre_2017['smb'].std() * 100,
stats.ttest_1samp(pre_2017['smb'] * 100, 0),
)
print(len(pre_2017))
print(
pre_2017['vmg'].mean() * 100,
pre_2017['vmg'].std() * 100,
stats.ttest_1samp(pre_2017['vmg'] * 100, 0),
)
print(Factor[['mkt', 'smb', 'vmg']].corr())