Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
AI4Finance-Foundation
GitHub Repository: AI4Finance-Foundation/FinRL
Path: blob/master/finrl/meta/preprocessor/yahoodownloader.py
732 views
1
"""Contains methods and classes to collect data from
Yahoo Finance API
"""
4
5
from __future__ import annotations
6
7
import pandas as pd
8
import yfinance as yf
9
10
11
class YahooDownloader:
    """Provides methods for retrieving daily stock data from
    Yahoo Finance API

    Attributes
    ----------
        start_date : str
            start date of the data (modified from neofinrl_config.py)
        end_date : str
            end date of the data (modified from neofinrl_config.py)
        ticker_list : list
            a list of stock tickers (modified from neofinrl_config.py)

    Methods
    -------
    fetch_data()
        Fetches data from yahoo API
    select_equal_rows_stock()
        Keeps only the tickers with at least the average number of rows

    """

    def __init__(self, start_date: str, end_date: str, ticker_list: list):
        self.start_date = start_date
        self.end_date = end_date
        self.ticker_list = ticker_list

    def fetch_data(self, proxy=None, auto_adjust=False) -> pd.DataFrame:
        """Fetches data from Yahoo API

        Parameters
        ----------
        proxy : optional
            Forwarded to ``yf.download`` as ``proxy`` when it is not None.
        auto_adjust : bool
            Forwarded to ``yf.download``. When False, open/high/low/close
            are rescaled here by the ratio ``Adj Close / Close`` and the
            adjusted-close column is dropped.

        Returns
        -------
        `pd.DataFrame`
            One row per (date, tic), sorted by date then tic, with the
            columns date, open, high, low, close, volume, tic and day.
            ``date`` is a "%Y-%m-%d" string, ``day`` is the day of the
            week (monday = 0). Rows containing missing values are dropped.

        Raises
        ------
        ValueError
            If no ticker in ``ticker_list`` returned any rows.
        """
        # Only forward `proxy` when the caller supplied one, so the default
        # call does not depend on yfinance still accepting that keyword.
        download_kwargs = {
            "start": self.start_date,
            "end": self.end_date,
            "auto_adjust": auto_adjust,
        }
        if proxy is not None:
            download_kwargs["proxy"] = proxy

        # Collect per-ticker frames and concatenate once at the end instead
        # of re-copying the accumulated frame on every iteration.
        frames = []
        for tic in self.ticker_list:
            temp_df = yf.download(tic, **download_kwargs)
            # Treat a missing or empty result as a failed ticker.
            if temp_df is None or len(temp_df) == 0:
                continue
            # Flatten two-level columns down to the field names.
            if temp_df.columns.nlevels != 1:
                temp_df.columns = temp_df.columns.droplevel(1)
            temp_df["tic"] = tic
            frames.append(temp_df)

        if not frames:
            raise ValueError("no data is fetched.")

        # reset the index, we want to use numbers as index instead of dates
        data_df = pd.concat(frames, axis=0).reset_index()
        try:
            # convert the column names to standardized names
            data_df = data_df.rename(
                columns={
                    "Date": "date",
                    "Adj Close": "adjcp",
                    "Close": "close",
                    "High": "high",
                    "Low": "low",
                    "Volume": "volume",
                    "Open": "open",
                    "tic": "tic",
                }
            )

            if not auto_adjust:
                data_df = self._adjust_prices(data_df)
        except NotImplementedError:
            print("the features are not supported currently")

        # create day of the week column (monday = 0)
        data_df["day"] = data_df["date"].dt.dayofweek
        # convert date to standard string format, easy to filter
        data_df["date"] = data_df["date"].dt.strftime("%Y-%m-%d")
        # drop missing data
        data_df = data_df.dropna()
        print("Shape of DataFrame: ", data_df.shape)

        return data_df.sort_values(by=["date", "tic"]).reset_index(drop=True)

    def _adjust_prices(self, data_df: pd.DataFrame) -> pd.DataFrame:
        """Rescale open/high/low/close by ``adjcp / close`` and drop ``adjcp``.

        The input frame is left untouched; a new frame is returned.
        """
        # use adjusted close price instead of close price
        ratio = data_df["adjcp"] / data_df["close"]
        # `drop` returns a new frame, so the assignments below do not leak
        # into the caller's `data_df`.
        adjusted = data_df.drop(columns=["adjcp"])
        for col in ["open", "high", "low", "close"]:
            adjusted[col] = adjusted[col] * ratio
        return adjusted

    def select_equal_rows_stock(self, df):
        """Return the rows of ``df`` whose ticker has at least the mean row count.

        The row count of every value in ``df.tic`` is computed, and only
        tickers whose count is greater than or equal to the mean of those
        counts are kept.
        """
        counts = df.tic.value_counts()
        select_stocks_list = list(counts[counts >= counts.mean()].index)
        return df[df.tic.isin(select_stocks_list)]
123
124