From 6580c7eab80575616a73766035166dbfcff2e341 Mon Sep 17 00:00:00 2001 From: "jmordetsky@bloomberg.net" Date: Thu, 8 Aug 2024 07:22:29 -0400 Subject: [PATCH] data processing: * add VIX to some lists * change create-diff-series to treat all symbols as a single file before scaling * save list files, not individual series * scalers are associated with the list --- {src/alfred/docs => docs}/constituents.md | 0 docs/fundementals-to-features.md | 34 + {src/alfred/docs => docs}/fundementals.md | 0 {src/alfred/docs => docs}/useful-tools.md | 0 lists/^GSPC_constituents.csv | 504 ++++++ lists/baby_training_list.csv | 3 +- lists/symbols.csv | 1618 +------------------- lists/training_list.csv | 1 + scripts/cache-fundementals.py | 49 + scripts/cache-lists.py | 129 ++ scripts/{cache_data.py => cache-prices.py} | 15 - scripts/create-diff-series.py | 74 +- 12 files changed, 780 insertions(+), 1647 deletions(-) rename {src/alfred/docs => docs}/constituents.md (100%) create mode 100644 docs/fundementals-to-features.md rename {src/alfred/docs => docs}/fundementals.md (100%) rename {src/alfred/docs => docs}/useful-tools.md (100%) create mode 100644 lists/^GSPC_constituents.csv create mode 100644 scripts/cache-fundementals.py create mode 100644 scripts/cache-lists.py rename scripts/{cache_data.py => cache-prices.py} (71%) diff --git a/src/alfred/docs/constituents.md b/docs/constituents.md similarity index 100% rename from src/alfred/docs/constituents.md rename to docs/constituents.md diff --git a/docs/fundementals-to-features.md b/docs/fundementals-to-features.md new file mode 100644 index 00000000..e460396a --- /dev/null +++ b/docs/fundementals-to-features.md @@ -0,0 +1,34 @@ +From the quarterly financial data provided by `yfinance` (i.e., `quarterly_balance_sheet`, `quarterly_cash_flow`, `quarterly_earnings`, `quarterly_financials`, and `quarterly_income_stmt`), you can engineer several key financial ratios and metrics that are valuable for predicting stock price movements. Here’s a breakdown of potential features you can derive from each of these datasets: + +### 1. **Quarterly Balance Sheet** +- **Debt-to-Equity Ratio (D/E)**: Total Liabilities / Shareholder's Equity. This ratio provides insight into the level of a company's debt relative to its equity, indicating financial leverage and risk. +- **Current Ratio**: Current Assets / Current Liabilities. A measure of liquidity, showing how well a company can meet short-term obligations. +- **Quick Ratio (Acid Test)**: (Current Assets - Inventories) / Current Liabilities. It's a more stringent test of liquidity compared to the current ratio. +- **Book Value Per Share**: Total Equity / Number of Outstanding Shares. Indicates the equity value per share and is used to compare the book value with the market value of the shares. + +### 2. **Quarterly Cash Flow** +- **Operating Cash Flow Margin**: Operating Cash Flow / Total Revenue. This ratio measures how much cash a company generates from its operational activities relative to its revenue. +- **Free Cash Flow (FCF)**: Operating Cash Flow - Capital Expenditures. Free cash flow is an indicator of a company's ability to generate additional revenues. +- **Cash Flow Coverage Ratios**: Various ratios that use cash flow figures to assess how well a company can meet its financial obligations, such as debt payments. + +### 3. **Quarterly Earnings** +- **Earnings Per Share (EPS) Growth**: Year-over-Year growth in EPS. This is a direct measure of a company's profitability and growth over time. +- **P/E Ratio**: Market Price Per Share / Earnings Per Share. While market price is not a direct output of `quarterly_earnings`, when combined with current stock prices, this ratio is essential for valuation. + +### 4. **Quarterly Financials** +- **Return on Assets (ROA)**: Net Income / Total Assets. This indicates how efficiently a company uses its assets to generate earnings. +- **Return on Equity (ROE)**: Net Income / Shareholder's Equity. This measures the profitability of equity investments, indicating how effectively a company uses equity financing. +- **Gross Profit Margin**: Gross Profit / Total Revenue. Shows the percentage of revenue that exceeds the cost of goods sold. + +### 5. **Quarterly Income Statement** +- **Operating Margin**: Operating Income / Total Revenue. It helps to understand how much of revenue is remaining after subtracting the operating expenses. +- **Net Profit Margin**: Net Income / Total Revenue. This ratio shows how much of each dollar earned by the company is translated into profits. +- **Year-over-Year Revenue Growth**: Comparison of revenue from the same quarter in previous years to measure business growth. + +### Additional Considerations +When creating features for a machine learning model: +- **Normalization**: Consider normalizing or standardizing your financial ratios to avoid scale issues, especially when different features range over different scales. +- **Lagged Variables**: Use lagged versions of these ratios to capture the financial state in previous quarters, helping to observe trends and cyclicality without introducing look-ahead bias. +- **Differential Features**: Create features that capture changes from one quarter to the next to identify growth trends or reversals in financial health. + +Engineering these features properly allows your machine learning model to capture a holistic view of the company’s financial health and trends, providing strong predictors for stock price movements. \ No newline at end of file diff --git a/src/alfred/docs/fundementals.md b/docs/fundementals.md similarity index 100% rename from src/alfred/docs/fundementals.md rename to docs/fundementals.md diff --git a/src/alfred/docs/useful-tools.md b/docs/useful-tools.md similarity index 100% rename from src/alfred/docs/useful-tools.md rename to docs/useful-tools.md diff --git a/lists/^GSPC_constituents.csv b/lists/^GSPC_constituents.csv new file mode 100644 index 00000000..b9753b0b --- /dev/null +++ b/lists/^GSPC_constituents.csv @@ -0,0 +1,504 @@ +Symbols +MMM +AOS +ABT +ABBV +ACN +ADBE +AMD +AES +AFL +A +APD +ABNB +AKAM +ALB +ARE +ALGN +ALLE +LNT +ALL +GOOGL +GOOG +MO +AMZN +AMCR +AEE +AAL +AEP +AXP +AIG +AMT +AWK +AMP +AME +AMGN +APH +ADI +ANSS +AON +APA +AAPL +AMAT +APTV +ACGL +ADM +ANET +AJG +AIZ +T +ATO +ADSK +ADP +AZO +AVB +AVY +AXON +BKR +BALL +BAC +BK +BBWI +BAX +BDX +BRK.B +BBY +BIO +TECH +BIIB +BLK +BX +BA +BKNG +BWA +BSX +BMY +AVGO +BR +BRO +BF.B +BLDR +BG +BXP +CDNS +CZR +CPT +CPB +COF +CAH +KMX +CCL +CARR +CTLT +CAT +CBOE +CBRE +CDW +CE +COR +CNC +CNP +CF +CHRW +CRL +SCHW +CHTR +CVX +CMG +CB +CHD +CI +CINF +CTAS +CSCO +C +CFG +CLX +CME +CMS +KO +CTSH +CL +CMCSA +CAG +COP +ED +STZ +CEG +COO +CPRT +GLW +CPAY +CTVA +CSGP +COST +CTRA +CRWD +CCI +CSX +CMI +CVS +DHR +DRI +DVA +DAY +DECK +DE +DAL +DVN +DXCM +FANG +DLR +DFS +DG +DLTR +D +DPZ +DOV +DOW +DHI +DTE +DUK +DD +EMN +ETN +EBAY +ECL +EIX +EW +EA +ELV +EMR +ENPH +ETR +EOG +EPAM +EQT +EFX +EQIX +EQR +ESS +EL +ETSY +EG +EVRG +ES +EXC +EXPE +EXPD +EXR +XOM +FFIV +FDS +FICO +FAST +FRT +FDX +FIS +FITB +FSLR +FE +FI +FMC +F +FTNT +FTV +FOXA +FOX +BEN +FCX +GRMN +IT +GE +GEHC +GEV +GEN +GNRC +GD +GIS +GM +GPC +GILD +GPN +GL +GDDY +GS +HAL +HIG +HAS +HCA +DOC +HSIC +HSY +HES +HPE +HLT +HOLX +HD +HON +HRL +HST +HWM +HPQ +HUBB +HUM +HBAN +HII +IBM +IEX +IDXX +ITW +INCY +IR +PODD +INTC +ICE +IFF +IP +IPG +INTU +ISRG +IVZ +INVH +IQV +IRM +JBHT +JBL +JKHY +J +JNJ +JCI +JPM +JNPR +K +KVUE +KDP +KEY +KEYS +KMB +KIM +KMI +KKR +KLAC +KHC +KR +LHX +LH +LRCX +LW +LVS +LDOS +LEN +LLY +LIN +LYV +LKQ +LMT +L +LOW +LULU +LYB +MTB +MRO +MPC +MKTX +MAR +MMC +MLM +MAS +MA +MTCH +MKC +MCD +MCK +MDT +MRK +META +MET +MTD +MGM +MCHP +MU +MSFT +MAA +MRNA +MHK +MOH +TAP +MDLZ +MPWR +MNST +MCO +MS +MOS +MSI +MSCI +NDAQ +NTAP +NFLX +NEM +NWSA +NWS +NEE +NKE +NI +NDSN +NSC +NTRS +NOC +NCLH +NRG +NUE +NVDA +NVR +NXPI +ORLY +OXY +ODFL +OMC +ON +OKE +ORCL +OTIS +PCAR +PKG +PANW +PARA +PH +PAYX +PAYC +PYPL +PNR +PEP +PFE +PCG +PM +PSX +PNW +PNC +POOL +PPG +PPL +PFG +PG +PGR +PLD +PRU +PEG +PTC +PSA +PHM +QRVO +PWR +QCOM +DGX +RL +RJF +RTX +O +REG +REGN +RF +RSG +RMD +RVTY +ROK +ROL +ROP +ROST +RCL +SPGI +CRM +SBAC +SLB +STX +SRE +NOW +SHW +SPG +SWKS +SJM +SW +SNA +SOLV +SO +LUV +SWK +SBUX +STT +STLD +STE +SYK +SMCI +SYF +SNPS +SYY +TMUS +TROW +TTWO +TPR +TRGP +TGT +TEL +TDY +TFX +TER +TSLA +TXN +TXT +TMO +TJX +TSCO +TT +TDG +TRV +TRMB +TFC +TYL +TSN +USB +UBER +UDR +ULTA +UNP +UAL +UPS +URI +UNH +UHS +VLO +VTR +VLTO +VRSN +VRSK +VZ +VRTX +VTRS +VICI +V +VST +VMC +WRB +GWW +WAB +WBA +WMT +DIS +WBD +WM +WAT +WEC +WFC +WELL +WST +WDC +WY +WMB +WTW +WYNN +XEL +XYL +YUM +ZBRA +ZBH +ZTS diff --git a/lists/baby_training_list.csv b/lists/baby_training_list.csv index 9b8a1e07..bd485a06 100644 --- a/lists/baby_training_list.csv +++ b/lists/baby_training_list.csv @@ -5,4 +5,5 @@ VNDA SOI TER LUMN -PACK \ No newline at end of file +PACK +^VIX \ No newline at end of file diff --git a/lists/symbols.csv b/lists/symbols.csv index dbe487c0..365d7355 100644 --- a/lists/symbols.csv +++ b/lists/symbols.csv @@ -1,1619 +1,9 @@ Symbols -BLK -AMZN -CULL -OPBK -SOHO -JRSH -CTLP -LYTS -AMTX -HAS -SWK -VRCA -LUNA -VPG -GABC -ATEX -OCC -OCUP -COP -OCX -LAKE -CHMG -SMLR -WNEB -PDFS -VZ -AAOI -MYE -OSG -FUNC -SD -QUIK -ACNT -DMRC -EIGR -OFIX -GL -TDW -NWSA -TBPH -PNRG -EARN -ANIP -MNTX -ETR -MTCH -APA -ROL -CDXS -FTNT -AAPL -NEXT -KALV -BMRA -WTW -SWKS -MNDO -OPY -SLB -PACK -ZDGE -LIN -ATOM -ARAY -LL -MLR -LFVN -AFL -TTWO -PWR -SPGI -CCB -EDUC -WYY -AWK -CPB -CLFD -ECL -MOD -ZEUS -J -DCOM -SLP -ISRG -CCI -SMTI -WRB -HUSA -GCBC -RMD -ABVC -FMBH -DAL -PLYM -EVER -FDS -ALLK -AMBC -WASH -BOTJ -JILL -BKR -STIM -KOSS -IP -CPS -CPIX -AEYE -WMB -HSTM -PESI -ALOT -FEIM -NUE -NATH -CPHC -FNKO -MBWM -A -OVLY -SPOK -CUTR -MGNX -OVID -BTCS -CDXC -WWR -BRT -PFC -STR -AME -FARO -TDY -CLX -JMSB -EEX -WTI -KVHI -XBIT -LCNB -RRGB -RSSS -PLAB -BSET -AIM -ACU -THFF -NCSM -PARR -WU -ESSA -NWLI -VIA -ELMD -EFX -SRAX -MSBI -PWFL -BBW -NRXP -STE -SNDA -OCGN -VTGN -PETQ -UHAL -SLS -ASPS -AKTS -ITW -CAC -KFFB -TCMD -IOR -AES -SRTS -UNB -STRM -GLT -TRGP -PLX -REG -ISSC -MMC -HBCP -CPRT -UHS -CMRX -HST -SNPS -NSSC -SUP -EFR -GAIA -ARTNA -TACT -HON -APEI -ZYXI -VMC -PRSO -ADIL -BBIG -CME -PCAR -WNC -PCSA -ACCO -SOY -TUP -ALL -TRV -PLD -ESOA -NCMI -INVE -ATRA -CFFI -BEEM -ARE -DPZ -CAMP -LPTX -CLSD -UVSP -MCD -UBX -BWA -HONE -VIRX -BAC -BHB -IMXI -EBF -ICCC -RCKY -LMT -WHR -LINC -CSCO -RMNI -ACET -MPWR -LTRX -FET -NVNO -WAVD -CMCT -LEN -FCCO -BK -CTSH -BDX -HOV -AMCR -CCL -NWFL -ATO -PRTS -RIGL -LOVE -GOOG -OMC -ENPH -DOV -FMNB -WRLD -EMN -IDT -MNST -BBY -SMMT -FCX -ANGO -MLSS -SYY -STT -UNH -GPN -ICE -CNSL -FFIV -SAMG -INFU -BRY -FMC -CRAI -LGL -CODA -ALGN -GEN -MFIN -CTMX -EXR -UONEK -DCO -URGN -SNDX -GNPX -BLX -EBAY -PLL -XWEL -SCTL -IPI -CSSE -WATT -WSR -CIA -ASIX -ZION -SBUX -MKC -PFE -MOFG -PNC -ALTO -VALU -AKAM -EBTC -ON -HSII -SSKN -TSN -PHX -META -IRIX -ALCO -MO -MOS -FISI -NTGR -OMQS -RF -DTE -INTC -REPL -FMAO -VNCE -SIGA -AJX -SMBC -AQB -RDVT -TFX -ACNB -PRTH -GLDD -GPMT -OSUR -ADP -FTK -AVNW -CNOB -SMID -HLT -BRID -ASRT -BIO -UAL -AMGN -TPC -LYV -AINC -SGMO -BGSF -FSP -FLNT -TLYS -ALDX -LFMD -NHTC -EXPE -ISDR -FSFG -ASPN -CLDT -VATE -AADI -RICK -UNTY -EVLO -DGII -CALB -WEYS -CRL -MCRI -NTRS -UONE -ACN -EGIO -ULTA -JCI -AEE -OSW -STEL -CELC -NEM -ROST -ICHR -DX -CVM -CRM -FNGR -MYMD -VTLE -DE -SCPH -QUAD -F -DXC -VSEC -BWB -KMI -XRAY -CULP -DRI -TNK -VRTX -FSLR -NBN -CI -CFG -GERN -NFBK -LBAI -PRTG -KNSA -IVR -FLXS -AIG -MHH -NFLX -FNWD -CHRS -MVIS -PIRS -PMTS -STX -MPB -CCRD -NI -SEAC -AVRO -AIRT -IZEA -SGC -AIZ -PW -CVLY -WSBF -TSLA -ULBI -IVZ -AOSL -IESC -HAYN -SRRK -GWW -TCS -NTIP -ED -ITI -AXP -PFIS -SOTK -BPRN -LEE -OCN -TPB -KMX -MBOT -BEN -HAL -INBK -ETN -CAG -DUK -LQDT -CHEF -TUSK -ASYS -GNE -RNGR -ADM -HMST -RCKT -ORGS -NAII -CMG -FRBA -ORLY -ORMP -MCB -CLIR -PYPL -RFL -NECB -TILE -INSW -WAB -ENZ -TTI -PCG -VTRS -ALRS -AMD -NTWK -PLBC -MTRX -GNSS -HROW -SRI -COST -ANIX -ZBRA -ATRO -BA -DLTH -AMNB -GILD -NAT -MVBF -OXBR -ACTG -PH -ONCT -STRS -CMCO -LBRDK -AGAE -GALT -PAYX -OCUL -MOH -ALLE -CMCL -FVCB -HTBK -GD -SGA -NMRD -CDNS -WYNN -CHMI -JOUT -GRPN -KIM -LUV -IVAC -DVAX -CIZN -POM -IR -EYPT -DAKT -MCBC -IDN -RL -PFIE -DCTH -CPSS -MKTX -NSC -TITN -GHM -CTXR -ES -PED -AGYS -UFPT -PGC -SENEA -DYAI -ZIVO -LARK -AORT -LPTH -HBB -GRWG -DRIO -WHG -FIXX -CPRX -BDL -OLP -BYFC -SCVL -DVN -HYMC -CRIS -AXTI -DZSI -RHI -HSY -LASR -DGX -OBT -NDAQ -BOC -MCS -MTEM -ANAB -EZPW -WMPN -GLBZ -NATR -ABUS -DD -DCPH -INTT -ADVM -VLGEA -HIBB -ULH -TYL -LCUT -TSCO -SGMA -EQBK -ABBV -PSX -PETS -HSON -MRNS -EVI -TEL -SNCR -BAH -PKOH -BR -CRMD -CCEL -MOV -CBRE -KINS -MPAA -CODX -NSYS -AMSC -CMS -CIX -DHX -HALL -COF -CATO -KEY -PFMT -EA -OXY -PAYC -ADMA -STCN -SELF -LW -STKS -KFS -AVXL -FRBK -ENG -ELV -ZBH -RYAM -INTZ -SALM -CMT -LCTX -ATLO -ATOS -POWL -OPRX -GM -NWS -PLOW -DXCM -LWAY -DIS -SYK -XAIR -FBIO -SYRS -BH -MTD -WLFC -PLPC -RTX -PFBC -DFS -CUE -TRST -CZNC -CZR -OPTN -SHBI -CE -AJG -SND -GRBK -VYGR -MDT -NBSE -ICD -FLIC -RSG -PCYO -RFIL -SEDG -PAVM -AXDX -CFBK -WBD -UDR -ATLC -GOOD -AMSWA -ACGL -XOMA -GMRE -TRNS -IMUX -DWSN -FCBC -FUV -NCLH -MBRX -CASS -AQMS -MDWD -MPC -CCRN -LIFE -HY -FUSB -MGTX -FRST -WEC -CLW -MET -INGN -CHTR -BW -CWBC -RM -CAL -WAT -OOMA -LE -LOW -CEVA -ROP -WTBA -TECH -ADBE -DELL -STRL -SCX -NX -GEOS -PARA -CVX -WRK -URE -CCLD -CDMO -LNG -VANI -KEYS -UVE -DHIL -MU -RELL -ISTR -O -EGAN -MTB -HEPA -TROW -BFI -UBFO -EXPD -EOLS -HOFT -BNED -APD -HAFC -BMY -EQR -FAST -IT -HLIT -UEC -KZR -FRT -GLRE -FAT -LPCN -IRMD -FANG -PAYS -UTMD -FONR -SFST -OPOF -PMCB -CSPI -CSGP -DSKE -PVBC -SNA -THRY -POWW -KODK -VTSI -MRAM -BLFS -BWFG -BYRN -NGVC -SEE -DLA -INOD -RMBL -WFCF -SFBC -AON -HPQ -HVT -WLDN -REPX -PI -RVSB -FRAF -SVRA -MSVB -GSBC -CPT -RLGT -AAP -VTR -ANSS -HFBL -KULR -VERO -BAND -DENN -ROK -TSBK -CDZI -ATNF -PWOD -MNKD -MSFT -VRA -AOS -VFC -FFNW -CVS -CVCY -PLSE -ARL -VLO -MMM -EFC -LXRX -AGS -RYI -WINA -RZLT -EXPR -QCRH -KDP -CAPR -BBGI -IIIV -ATEN -ADSK -MG -CRWS -KR -FDBC -NVDA -MS -JAKK -SBT -WELL -FRD -EPSN -LLY -TBBK -LPG -CMTL -EVC -KMB -CHUY -VRDN -GLW -SMMF -CSX -OKE -CTRN -RJF -KNX -CCBG -COFS -BOOM -VABK -CURO -HZO -NOC -TMUS -VVI -LFT -WM -MACK -EGY -GNK -OPTT SOI -KELYA -XOM -FFIC -CLDX -TJX -PBPB -PMD -DVA -TT -KIRK -VRAY -BAX -BSX -HD -TRMB -HBAN -FNWB -BJRI -UHT -ABT -CVV -RCMT -YEXT -APH -AAME -NR -MCHP -CLSK -KO -DMTK -SHW -ICAD -HCA -VVX -METC -BCML -NVEC -PEG -ANET -SRE -ALK -CMCSA -CTLT -SLNG -CCO -EL -AWRE -PFG -ODC -ASUR -EPAM -FRPH -GOOGL -FORR -CBFV -IBM -ERIE -ESS -RVP -PGEN -CLAR -CNTY -VERU -BFC -URI -SBOW -INCY -SCYX -PRDO -EGRX -AEHR -SLNH -BY -EMKR -XXII -SJM -MAA -KIDS -YORW -RDNT -HFFG -MRIN -ARR -PRU -NOTV -IMMR -SBFG -UNP -CF -SMSI -LOAN -REFR -HLX -FSBW -COLL -NKSH -INO -MXC -VXRT -LKQ -TZOO -WCN -PTC -EML -FHN -DARE -JNPR -AYRO -KOPN -MRO -BFST -GEG -PDEX -STXS -T -AE -KBAL -DSGR -GIFI -PXLW -BKSC -XFOR -LNT -EVBN -ENSV -GNRC -MCRB -WY -SACH -SPWH -AGX -HIG -DLR -SP -K -UG -OESX -XERS -HCI -SHYF -IPWR -NKE -EIX -AMPY -FTV -OVBC -IQV -CBAY -FBIZ -SYPR -TCI -AVB -VOXX -AQST -CVLG -AZO -AFMD -CRDF -COGT -JPM -HES -INTG -MSI -LSBK -CNP -TMO -CLPT -KLAC -HWBK -HPE -FKWL -PHM -XEL -CBAN -EVRG -BCLI -BZH -GSIT -VICI -GREE -LOCO -GEO -WMT -FIS -NRIM -CLPR -GEVO -QCOM -PKG -AVD -ASC -MULN -CATC -THR -UMH -RMCF -MDLZ -ESCA -AGM -IBCP -CHD -GWRS -NODK -QRHC -L +BBIG TER -HWM -MHK -RMAX -SSTI -EQT -ANIK -LFCR -ADI -ESQ -LNC -ATXS -DGLY -MRK -ORC -DXLG -ESP -ATNI -PRTA -HBNC -SYBX -POOL -ASMB -VSTM -BRFH -GORO -ELA -TBNK -SYF -SXC -BRO -UEIC -LVS -DHC -TXT -PKBK -ARC -INSE -PBYI -BLBD -PSA -SWBI -PROV -FE -PNR -MAYS -DG -PGR -PAHC -POLA -BFIN -MBCN -WVVI -KOP -TWIN -INVH -CINF -ORCL -OMER -ETD -PPIH -GS -BKTI -FNCB -LINK -VERI -BCBP -REI -CTSO -TDG -KEQU -ARW -EBMT -GDEN -IRM -GTIM -SBNY -HOLX -HIFS -BKNG -REKR -EW -LDOS -LAND -CSBR -PDLB -AFG -VIRC -AXR -FLWS -LTBR -HMNF -IEX -VRSK -FARM -PXD -RBBN -RBCAA -WKHS -CL -CARE -VBFC -ERNA -RGCO -AVY -NESR -TRC -BSRR -LUMO -FC -D -PKE -VHC -TG -NBIX -GRMN -OMEX -MAR -CIVB -HHS -PBHC -WRAP -CNC -EP -PTGX -PPG -HCKT -APT -CDW -CENN -TK -PEBO -IDXX -NL -ABIO -STRT -MCHX -EXC -CXDO -AREC -WTTR -CHRW -UFCS -DIT -KRMD -CARA -LMB -NOW -PG -JNJ -AEP -UFI -CTAS -AAL -CAT -WULF -BRBS -JKHY -EHTH -PRPH -SNFCA -MNOV -SENS -MYFW -MITT -LMNR -NGS -GMGI -PHUN -UTL -QNST -LBTYK -CIO -TIPT -TCON -RGS -HURC -FLT -BTBT -ZUMZ -SIEB -DJCO -SCHW -STZ -SGH -KPTI -GIS -AROW -CBOE -SDPI -VB -UBCP -FF -VTOL -RCL -SANW -SBAC -APTV -WVE -KE -UPS -BBWI -IFF -FLL -BCOV -GENC -ELDN -MPX -HGBL -AHT -WST -BTTR -ISUN -TSQ -AREN -TARA -EYEN -JYNT -FOR -OFED -FNLC -REVG -VNRX -CSTE -PANL -ODFL -MHLD -NWL -OSS -DMAC -MLP +PACK +PHM LUMN -SRDX -INTU -BMRN -VKTX -ALT -CRMT -TAP -INSG -SMBK -TCX -MDXG -GTBP -HRTX -MTW -GVP -MSCI -ATEC -BLUE -HNRG -TH -III -FSTR -CMA -VTNR -ZTS -FSI -HZNP -RCEL -SB VNDA -FTEK -HBIO -CRVS -CPF -ACRE -RCAT -KTCC -BXC -DLTR -WFC -AMAT -CONN -WBA -CB -CTHR -ARMP -HNNA -CSV -ORRF -HQI -CVGW -TPIC -CASA -FCAP -RBB -LHX -NRG -OIS -BMRC -MGYR -CARV -GCO -DFIN -EQIX -CAH -DXPE -BTCY -AUBN -AC -CHCI -EOG -MLM -RGP -C -YUM -IPG -PTSI -CRNX -BELFB -XYL -QMCO -NC -NIC -IIIN -CECO -BXP -FCUV -AXGN -FOSL -QRVO -ATNM -PRPL -BBCP -CLMB -LAZY -FBMS -BIIB -MA -ESPR -MCO -OTLK -AGEN -ALB -DLHC -LIVE -STLD -PPSI -VRSN -BOXL -QUBT -CVGI -CCNE -BLBX -TAST -TISI -RVYL -BALL -HUM -TPR -CKPT -BHR -NTIC -CTRA -BYSI -NVR -LSEA -DXYN -HSIC -MRBK -TGT -RDI -LRMR -COO -ARCT -REGN -LIND -ENVA -SPTN -HRTG -SWKH -BGFV -DHI -MBI -ACMR -PPL -FGBI -FXNC -NEE -ORN -VHI -CMI -PEBK -OSBC -NINE -AIRG -CWCO -ASRV -MRSN -ETSY -NTAP -NICK -CTO -SCSC -GPC -TXN -GTHX -CZWI -RLMD -GCI -SMHI -SPRO -AVGO -TWI -OSPN -CTS -KHC -CTBI -MCK -LYB -MTEX -DGICA -BWEN -LTRPA -MCFT -AFBI -LODE -SCOR -HTBI -ILMN -AMT -FFBW -PLCE -VUZI -MAS -VBIV -NDLS -BLNK -MITK -MNSB -CSTR -PNW -JBHT -PDSB -EGLE -ACR -BBSI -FDX -NWPX -HWKN -MBIN -LRCX -PEP -SURG -PM -RAIL -UIS -FITB -TFC -CPSH -HEAR -ICCH -GNTY -NDSN -NPK -EPM -HDSN -TPHS -AP -PCYG -WW -HRL -ERII -ALPN -BTAI -CARS -IDR -SPG -FPI -LQDA -LAB -TTSH -NXPI -NNBR -DHR -GE -LEU -QIPT -FFWM -EMR -LH -LXFR -WDC -MGM -REX -ITIC -SSBI -AMRK -SO -RYTM -HII -LMAT -UPLD -ACRS +^VIX \ No newline at end of file diff --git a/lists/training_list.csv b/lists/training_list.csv index b58341eb..9ea9b8a0 100644 --- a/lists/training_list.csv +++ b/lists/training_list.csv @@ -1674,3 +1674,4 @@ MOS SGMO PARA VTR +^VIX \ No newline at end of file diff --git a/scripts/cache-fundementals.py b/scripts/cache-fundementals.py new file mode 100644 index 00000000..306ef5ea --- /dev/null +++ b/scripts/cache-fundementals.py @@ -0,0 +1,49 @@ +import pandas as pd +import yfinance as yf +import os +import argparse + + +def fetch_fundamentals(symbol): + stock = yf.Ticker(symbol) + df_financials = stock.quarterly_financials.transpose() + df_balance_sheet = stock.quarterly_balance_sheet.transpose() + df_cash_flow = stock.quarterly_cash_flow.transpose() + df_fundamentals = pd.concat([df_financials, df_balance_sheet, df_cash_flow], axis=1) + + return df_fundamentals + + +def main(symbols_file, data_dir): + df_symbols = pd.read_csv(symbols_file) + for symbol in df_symbols['Symbols']: + print(f"Processing {symbol}") + df_fundamentals = fetch_fundamentals(symbol) + + price_file_path = os.path.join(data_dir, f"{symbol}.csv") + + if os.path.exists(price_file_path): + df_prices = pd.read_csv(price_file_path) + df_fundamentals.index = pd.to_datetime(df_fundamentals.index) + df_prices.index = pd.to_datetime(df_prices['Date']) + + # Merging data + df_combined = df_prices.join(df_fundamentals, how='outer') + df_combined.fillna(method='ffill', inplace=True) + + # Writing to CSV + output_path = os.path.join(data_dir, f"{symbol}_fundamentals.csv") + df_combined.to_csv(output_path) + print(f"Written combined data to {output_path}") + else: + print(f"Pricing data file not found for {symbol}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Fetch and process stock fundamentals and pricing data.") + parser.add_argument("--symbol-file", type=str, help="Path to the CSV file containing stock symbols") + parser.add_argument("--data-dir", default="./data", type=str, + help="Directory to look for pricing data and save output") + + args = parser.parse_args() + main(args.symbol_file, args.data_dir) diff --git a/scripts/cache-lists.py b/scripts/cache-lists.py new file mode 100644 index 00000000..e87cc804 --- /dev/null +++ b/scripts/cache-lists.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python3 +import argparse +import requests +from bs4 import BeautifulSoup +import csv +import re + +def list_indexes(): + url = "https://uk.finance.yahoo.com/world-indices/" + headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'} + response = requests.get(url, headers=headers) + soup = BeautifulSoup(response.text, 'html.parser') + + table = soup.find('table') + if not table: + print("Failed to find indices table on Yahoo Finance.") + return + + indices = {} + for row in table.find_all('tr')[1:]: + cells = row.find_all('td') + if len(cells) >= 2: + symbol = cells[0].text.strip() + name = cells[1].text.strip() + indices[symbol] = name + + for symbol, name in indices.items(): + print(f"{symbol}: {name}") + + +index_urls = { + "^FTSE": "https://en.wikipedia.org/wiki/FTSE_100_Index", # FTSE 100 + "^GSPC": "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies", # S&P 500 + "^DJI": "https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average", # Dow Jones Industrial Average + "^IXIC": "https://en.wikipedia.org/wiki/NASDAQ_Composite", # NASDAQ Composite + "^GDAXI": "https://en.wikipedia.org/wiki/DAX", # DAX PERFORMANCE-INDEX + "^FCHI": "https://en.wikipedia.org/wiki/CAC_40", # CAC 40 + "^N225": "https://en.wikipedia.org/wiki/Nikkei_225", # Nikkei 225 + "^HSI": "https://en.wikipedia.org/wiki/Hang_Seng_Index", # HANG SENG INDEX + "000001.SS": "https://en.wikipedia.org/wiki/SSE_Composite_Index", # SSE Composite Index + "^AXJO": "https://en.wikipedia.org/wiki/S%26P/ASX_200", # S&P/ASX 200 + "^GSPTSE": "https://en.wikipedia.org/wiki/S%26P/TSX_Composite_Index", # S&P/TSX Composite index + "^RUT": "https://en.wikipedia.org/wiki/Russell_2000_Index", # Russell 2000 + "^VIX": "https://en.wikipedia.org/wiki/VIX", # CBOE Volatility Index + "^STOXX50E": "https://en.wikipedia.org/wiki/EURO_STOXX_50", # ESTX 50 PR.EUR + "^N100": "https://en.wikipedia.org/wiki/Euronext_100", # Euronext 100 Index + "^BFX": "https://en.wikipedia.org/wiki/BEL20", # BEL 20 + "IMOEX.ME": "https://en.wikipedia.org/wiki/MOEX_Russia_Index", # MOEX Russia Index + "^NYA": "https://en.wikipedia.org/wiki/NYSE_Composite", # NYSE COMPOSITE (DJ) + "^XAX": "https://en.wikipedia.org/wiki/NYSE_American", # NYSE AMEX COMPOSITE INDEX + "^STI": "https://en.wikipedia.org/wiki/Straits_Times_Index", # STI Index + "^BSESN": "https://en.wikipedia.org/wiki/BSE_SENSEX", # S&P BSE SENSEX + "^JKSE": "https://en.wikipedia.org/wiki/IDX_Composite", # IDX COMPOSITE + "^KLSE": "https://en.wikipedia.org/wiki/FTSE_Bursa_Malaysia_KLCI", # FTSE Bursa Malaysia KLCI + "^NZ50": "https://en.wikipedia.org/wiki/NZX_50_Index", # S&P/NZX 50 INDEX GROSS + "^KS11": "https://en.wikipedia.org/wiki/KOSPI", # KOSPI Composite Index + "^TWII": "https://en.wikipedia.org/wiki/Taiwan_Capitalization_Weighted_Stock_Index", # TSEC weighted index + "^BVSP": "https://en.wikipedia.org/wiki/Ibovespa", # IBOVESPA + "^MXX": "https://en.wikipedia.org/wiki/Índice_de_Precios_y_Cotizaciones", # IPC MEXICO + "^IPSA": "https://en.wikipedia.org/wiki/IPSA", # S&P IPSA + "^MERV": "https://en.wikipedia.org/wiki/MERVAL", # MERVAL + "^TA125.TA": "https://en.wikipedia.org/wiki/Tel_Aviv_125", # TA-125 + "^CASE30": "https://en.wikipedia.org/wiki/EGX_30", # EGX 30 Price Return Index + "^NSEI": "https://en.wikipedia.org/wiki/NIFTY_50" # NIFTY 50 +} + + +def scrape_constituents(index, url): + headers = {'User-Agent': 'Mozilla/5.0'} + response = requests.get(url, headers=headers) + soup = BeautifulSoup(response.text, 'html.parser') + constituents = [] + + # Find the first table on the Wikipedia page + table = soup.find('table', {'class': 'wikitable sortable'}) + if table: + headers = [th.text.strip() for th in table.find('tr').find_all('th')] + # Find the index of the column with 'Ticker', 'Symbol', or 'Ticker Symbol' + column_index = None + for header in ['Ticker', 'Symbol', 'Ticker Symbol']: + if header in headers: + column_index = headers.index(header) + break + if column_index is None: + raise ValueError("No 'Ticker', 'Symbol', or 'Ticker Symbol' column found in the table") + + rows = table.find_all('tr')[1:] # skip header row + for row in rows: + cells = row.find_all('td') + if cells: + ticker = cells[column_index].text.strip() + # Remove any exchange prefix from the ticker symbol + ticker = re.sub(r'^\w+:(\w{2,4})$', r'\1', ticker) + constituents.append(ticker) + + return constituents + +def save_to_csv(index, constituents, dir): + filename = f"{dir}/{index}_constituents.csv" + with open(filename, 'w', newline='', encoding='utf-8') as file: + writer = csv.writer(file) + writer.writerow(['Symbols']) + for constituent in constituents: + writer.writerow([constituent]) + print(f"Saved constituents of {index} to {filename}") + +parser = argparse.ArgumentParser(description="Cache lists of index constituents from Yahoo Finance") +parser.add_argument("-l", "--list", action='store_true', help="List all available indexes") +parser.add_argument("-i", "--index", type=str, help="Index symbol to cache constituents for") +parser.add_argument("-d", "--directory", default="./lists", help="Directory to save the cached CSV file") +parser.add_argument("-dr", "--dry-run", action='store_true', help="Perform a dry run to list index constituents without downloading") + +args = parser.parse_args() + +if args.list: + list_indexes() +elif args.index: + if args.index not in index_urls: + print(f"I don't know {args.index} sorry :(") + elif args.dry_run: + members = scrape_constituents(args.index, index_urls[args.index]) + for member in members: + print(member) + else: + members = scrape_constituents(args.index, index_urls[args.index]) + save_to_csv(args.index, members, args.directory) + +else: + print("Requires --index to download or --dry-run to list members or --list to show available lists") \ No newline at end of file diff --git a/scripts/cache_data.py b/scripts/cache-prices.py similarity index 71% rename from scripts/cache_data.py rename to scripts/cache-prices.py index d171a598..fbf25f55 100755 --- a/scripts/cache_data.py +++ b/scripts/cache-prices.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 import pandas as pd -import random import argparse from alfred.data import download_ticker_list @@ -10,21 +9,9 @@ parser.add_argument("-fo", "--symbol-file-out", default="./lists/symbols.csv", help="Output file - all bad tickers trimmed") parser.add_argument("-o", "--output-dir", default="./data", help="Output directory (default: ./data)") -parser.add_argument("-rs", "--random-spys", type=int, default=None, help="Number of random stocks to select from SPY") args = parser.parse_args() -def rando_spys(num): - sp_assets = pd.read_html( - 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0] - assets = sp_assets['Symbol'].str.replace('.', '-').tolist() - - # Select num random symbols from SPY - random_symbols = random.sample(assets, num) - - return random_symbols - - def load_symbols_from_file(file): return pd.read_csv(file)["Symbols"].tolist() @@ -36,8 +23,6 @@ def load_symbols_from_file(file): if args.symbol_file is not None: symbols += load_symbols_from_file(args.symbol_file) -if args.random_spys is not None: - symbols += rando_spys(args.random_spys) symbols = list(set(symbols)) bad_symbols = download_ticker_list(symbols, args.output_dir, interval="1wk") diff --git a/scripts/create-diff-series.py b/scripts/create-diff-series.py index 5e0e43a7..56a2a899 100755 --- a/scripts/create-diff-series.py +++ b/scripts/create-diff-series.py @@ -1,20 +1,17 @@ #!/usr/bin/env python3 from alfred.data import read_symbol_file -from alfred.data import attach_moving_average_diffs, scale_relevant_training_columns, attach_profits +from alfred.data import attach_moving_average_diffs, scale_relevant_training_columns import argparse import os import joblib import pandas as pd - def main(): parser = argparse.ArgumentParser() parser.add_argument('--symbols', type=str, help="Symbols to use separated by comma") - parser.add_argument('--symbol-file', type=str, default="./lists/training_list.csv", help="List of symbols in a file") + parser.add_argument('--symbol-file', type=str, help="List of symbols in a file") parser.add_argument('--data', type=str, default="./data", help="data dir (./data)") - parser.add_argument('--windows', type=str, default="8,12,24", help="profit windows , separated") - parser.add_argument('--bars', type=str, default="weekly", help="bar type") - + parser.add_argument('--pred', type=int, nargs="+", help="A space separated list of prediction periods in days") args = parser.parse_args() symbols = [] @@ -23,25 +20,68 @@ def main(): else: symbols += pd.read_csv(args.symbol_file)["Symbols"].tolist() - windows = args.windows.split(',') - windows = [int(window) for window in windows] - + ticker_data_frames = [] for symbol in symbols: print("pre-processing: ", symbol) df = read_symbol_file(args.data, symbol) if df is None: continue + # fix dates + df['Date'] = df['Date'].apply(lambda x: str(x)) + df['Date'] = pd.to_datetime(df['Date']) + + # attach moving averages df, columns = attach_moving_average_diffs(df) - columns.extend(["Close"]) - df, scaler = scale_relevant_training_columns(df, columns) - df = df[columns].dropna() - df = attach_profits(windows, args.bars, df) - processed_file = os.path.join(args.data, f"{symbol}_diffs.csv") - df.to_csv(processed_file) - # todo: graph the diffs - make sure they look the way they should - joblib.dump(scaler, os.path.join(args.data, f'{symbol}_scaler.save')) + # attach the labels for price movement + for pred in args.pred: + label = f'label_change_term_{pred}' + df[label] = df['close'].pct_change(periods=pred).shift( + periods=(-1 * pred)) + columns.append(label) + + # capture the close price + columns.append("Close") + + # drop columns we don't want. We need columns untouched for later + temp_columns = columns + ["Date"] + df = df[temp_columns] + + # index by symbol + df["Symbol"] = symbol + + # prepare to merge all + ticker_data_frames.append(df) + + final_df = pd.concat(ticker_data_frames) + + # sort by date, then ticker 1,msft, 1,aapl | 2,msft etc + final_df = final_df.sort_values(by=['Date', 'Symbol']) + + # fill na + final_df = final_df.ffill().bfill() + + # save unscaled interim path + directory = os.path.dirname(args.symbol_file) + base_name = os.path.basename(args.symbol_file) + file_name, file_extension = os.path.splitext(base_name) + new_file_name = f"{file_name}_processed_unscaled{file_extension}" + new_file_path = os.path.join(directory, new_file_name) + final_df.to_csv(new_file_path) + + # continue scaling + final_df, scaler = scale_relevant_training_columns(final_df, columns) + + # save the scaled data file (final) + new_file_name = f"{file_name}_processed_scaled{file_extension}" + new_file_path = os.path.join(directory, new_file_name) + df.to_csv(new_file_path) + + # save the scaler + new_file_name = f"{file_name}_scaler.save" + new_file_path = os.path.join(directory, new_file_name) + joblib.dump(scaler, os.path.join(args.data, new_file_path)) if __name__ == "__main__":