Path: blob/master/finrl/meta/env_stock_trading/env_stocktrading_cashpenalty.py
from __future__ import annotations

import random
import time
from copy import deepcopy

import gym
import matplotlib
import numpy as np
import pandas as pd
from gym import spaces
from stable_baselines3.common import logger
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.vec_env import SubprocVecEnv

matplotlib.use("Agg")


class StockTradingEnvCashpenalty(gym.Env):
    """
    A stock trading environment for OpenAI gym.
    This environment penalizes the model for not maintaining a reserve of cash,
    which trains the model to manage cash reserves in addition to trading.

    Reward at any step is given as follows:
        r_i = (sum(cash, asset_value) - initial_cash
               - max(0, sum(cash, asset_value) * cash_penalty_proportion - cash)) / days_elapsed
    This reward function takes into account a liquidity requirement as well as
    long-term accrued rewards.

    Parameters:
        df (pandas.DataFrame): Dataframe containing data
        buy_cost_pct (float): cost for buying shares
        sell_cost_pct (float): cost for selling shares
        date_col_name (str): name of the date column in df
        hmax (int, array): maximum cash to be traded in each trade per asset.
            If an array is provided, each index corresponds to one asset
        discrete_actions (bool): whether to discretize the action space
        shares_increment (int): shares can only be traded in multiples of this
            number in each trade. Only applicable if discrete_actions=True
        turbulence_threshold (float): maximum market turbulence under which
            purchases may occur. If exceeded, all positions are liquidated
        print_verbosity (int): how often (in steps) to print stats about the
            state of the env
        initial_amount (int, float): amount of cash initially available
        daily_information_cols (list(str)): columns used to build the state
            space from the dataframe. These can be OHLC columns or any other
            variables such as technical indicators and the turbulence index
        cache_indicator_data (bool): if True, precompute all state vectors at
            init to speed up stepping, at the cost of memory
        cash_penalty_proportion (int, float): fraction of total assets that
            must be held as cash; shortfalls below this reserve are penalized
            in the reward
        random_start (bool): if True, each episode starts at a random date in
            the first half of the dataset
        patient (bool): if True, skip buys when out of cash instead of ending
            the episode
        currency (str): currency symbol used when printing stats

    RL Inputs and Outputs:
        action space: [<n_assets>,] in range {-1, 1}
        state space: [start_cash, [shares_i for i in assets], [[indicator_j for j in indicators] for i in assets]]

    TODO:
        Organize functions
        Write README
        Document tests
    """

    metadata = {"render.modes": ["human"]}

    def __init__(
        self,
        df,
        buy_cost_pct=3e-3,
        sell_cost_pct=3e-3,
        date_col_name="date",
        hmax=10,
        discrete_actions=False,
        shares_increment=1,
        turbulence_threshold=None,
        print_verbosity=10,
        initial_amount=1e6,
        daily_information_cols=["open", "close", "high", "low", "volume"],
        cache_indicator_data=True,
        cash_penalty_proportion=0.1,
        random_start=True,
        patient=False,
        currency="$",
    ):
        self.df = df
        self.stock_col = "tic"
        self.assets = df[self.stock_col].unique()
        self.dates = df[date_col_name].sort_values().unique()
        self.random_start = random_start
        self.discrete_actions = discrete_actions
        self.patient = patient
        self.currency = currency
        self.df = self.df.set_index(date_col_name)
        self.shares_increment = shares_increment
        self.hmax = hmax
        self.initial_amount = initial_amount
        self.print_verbosity = print_verbosity
        self.buy_cost_pct = buy_cost_pct
        self.sell_cost_pct = sell_cost_pct
        self.turbulence_threshold = turbulence_threshold
        self.daily_information_cols = daily_information_cols
        self.state_space = (
            1 + len(self.assets) + len(self.assets) * len(self.daily_information_cols)
        )
        self.action_space = spaces.Box(low=-1, high=1, shape=(len(self.assets),))
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(self.state_space,)
        )
        self.turbulence = 0
        self.episode = -1  # initialize so we can call reset
        self.episode_history = []
        self.printed_header = False
        self.cache_indicator_data = cache_indicator_data
        self.cached_data = None
        self.cash_penalty_proportion = cash_penalty_proportion
        if self.cache_indicator_data:
            print("caching data")
            self.cached_data = [
                self.get_date_vector(i) for i, _ in enumerate(self.dates)
            ]
            print("data cached!")

    def seed(self, seed=None):
        if seed is None:
            seed = int(round(time.time() * 1000))
        random.seed(seed)

    @property
    def current_step(self):
        return self.date_index - self.starting_point

    @property
    def cash_on_hand(self):
        # amount of cash held at current timestep
        return self.state_memory[-1][0]

    @property
    def holdings(self):
        # quantity of shares held at current timestep
        return self.state_memory[-1][1 : len(self.assets) + 1]

    @property
    def closings(self):
        return np.array(self.get_date_vector(self.date_index, cols=["close"]))
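    # Illustrative note (not part of the original docs): with two assets and
    # the default daily_information_cols, the state vector built by reset()
    # and step() has length 1 + 2 + 2 * 5 = 13, matching state_space in
    # __init__, and is laid out as
    #   [cash,
    #    shares_asset1, shares_asset2,
    #    open_1, close_1, high_1, low_1, volume_1,
    #    open_2, close_2, high_2, low_2, volume_2]
    # following the per-asset column order produced by get_date_vector().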
    def reset(
        self,
        *,
        seed=None,
        options=None,
    ):
        self.seed(seed)
        self.sum_trades = 0
        if self.random_start:
            starting_point = random.choice(range(int(len(self.dates) * 0.5)))
            self.starting_point = starting_point
        else:
            self.starting_point = 0
        self.date_index = self.starting_point
        self.turbulence = 0
        self.episode += 1
        self.actions_memory = []
        self.transaction_memory = []
        self.state_memory = []
        self.account_information = {
            "cash": [],
            "asset_value": [],
            "total_assets": [],
            "reward": [],
        }
        init_state = np.array(
            [self.initial_amount]
            + [0] * len(self.assets)
            + self.get_date_vector(self.date_index)
        )
        self.state_memory.append(init_state)
        return init_state

    def get_date_vector(self, date, cols=None):
        if (cols is None) and (self.cached_data is not None):
            return self.cached_data[date]
        else:
            date = self.dates[date]
            if cols is None:
                cols = self.daily_information_cols
            trunc_df = self.df.loc[[date]]
            v = []
            for a in self.assets:
                subset = trunc_df[trunc_df[self.stock_col] == a]
                v += subset.loc[date, cols].tolist()
            assert len(v) == len(self.assets) * len(cols)
            return v

    def return_terminal(self, reason="Last Date", reward=0):
        state = self.state_memory[-1]
        self.log_step(reason=reason, terminal_reward=reward)
        # add outputs to logger interface
        gl_pct = self.account_information["total_assets"][-1] / self.initial_amount
        logger.record("environment/GainLoss_pct", (gl_pct - 1) * 100)
        logger.record(
            "environment/total_assets",
            int(self.account_information["total_assets"][-1]),
        )
        reward_pct = self.account_information["total_assets"][-1] / self.initial_amount
        logger.record("environment/total_reward_pct", (reward_pct - 1) * 100)
        logger.record("environment/total_trades", self.sum_trades)
        logger.record(
            "environment/avg_daily_trades",
            self.sum_trades / (self.current_step),
        )
        logger.record(
            "environment/avg_daily_trades_per_asset",
            self.sum_trades / (self.current_step) / len(self.assets),
        )
        logger.record("environment/completed_steps", self.current_step)
        logger.record(
            "environment/sum_rewards", np.sum(self.account_information["reward"])
        )
        logger.record(
            "environment/cash_proportion",
            self.account_information["cash"][-1]
            / self.account_information["total_assets"][-1],
        )
        return state, reward, True, {}

    def log_step(self, reason, terminal_reward=None):
        if terminal_reward is None:
            terminal_reward = self.account_information["reward"][-1]
        cash_pct = (
            self.account_information["cash"][-1]
            / self.account_information["total_assets"][-1]
        )
        gl_pct = self.account_information["total_assets"][-1] / self.initial_amount
        rec = [
            self.episode,
            self.date_index - self.starting_point,
            reason,
            f"{self.currency}{'{:0,.0f}'.format(float(self.account_information['cash'][-1]))}",
            f"{self.currency}{'{:0,.0f}'.format(float(self.account_information['total_assets'][-1]))}",
            f"{terminal_reward * 100:0.5f}%",
            f"{(gl_pct - 1) * 100:0.5f}%",
            f"{cash_pct * 100:0.2f}%",
        ]
        self.episode_history.append(rec)
        print(self.template.format(*rec))

    def log_header(self):
        if self.printed_header is False:
            # column widths: 4, 4, 15, 15, 15, 10, 10, 10
            self.template = "{0:4}|{1:4}|{2:15}|{3:15}|{4:15}|{5:10}|{6:10}|{7:10}"
            print(
                self.template.format(
                    "EPISODE",
                    "STEPS",
                    "TERMINAL_REASON",
                    "CASH",
                    "TOT_ASSETS",
                    "TERMINAL_REWARD_unsc",
                    "GAINLOSS_PCT",
                    "CASH_PROPORTION",
                )
            )
            self.printed_header = True
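    # Worked example of the reward below (illustrative numbers, not from the
    # original file): with initial_amount=1e6, cash_penalty_proportion=0.1,
    # and, after 10 steps, total_assets=1,050,000 of which cash=50,000:
    #   required reserve = 0.1 * 1,050,000 = 105,000
    #   cash_penalty     = max(0, 105,000 - 50,000) = 55,000
    #   reward           = ((1,050,000 - 55,000) / 1,000,000 - 1) / 10 = -0.0005
    # i.e. a nominal 5% gain still earns a negative reward when the cash
    # reserve is too thin.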
    def get_reward(self):
        if self.current_step == 0:
            return 0
        else:
            assets = self.account_information["total_assets"][-1]
            cash = self.account_information["cash"][-1]
            cash_penalty = max(0, (assets * self.cash_penalty_proportion - cash))
            assets -= cash_penalty
            reward = (assets / self.initial_amount) - 1
            reward /= self.current_step
            return reward

    def get_transactions(self, actions):
        """
        This function takes a raw 'action' from the model and turns it into
        realistic transactions. It includes the optional discretization logic
        as well as the turbulence logic.
        """
        # record actions of the model
        self.actions_memory.append(actions)

        # multiply actions by the hmax value to get dollar amounts
        actions = actions * self.hmax

        # do nothing for shares with zero value
        actions = np.where(self.closings > 0, actions, 0)

        # discretize optionally
        if self.discrete_actions:
            # convert to integer share counts because we can't buy fractions of shares
            actions = actions // self.closings
            actions = actions.astype(int)
            # round actions toward zero to the nearest multiple of shares_increment,
            # leaving exact multiples intact
            actions = np.where(
                actions >= 0,
                (actions // self.shares_increment) * self.shares_increment,
                -((-actions // self.shares_increment) * self.shares_increment),
            )
        else:
            actions = actions / self.closings

        # can't sell more than we have
        actions = np.maximum(actions, -np.array(self.holdings))

        # deal with turbulence
        if self.turbulence_threshold is not None:
            # if turbulence goes over the threshold, just clear out all positions
            if self.turbulence >= self.turbulence_threshold:
                actions = -(np.array(self.holdings))
                self.log_step(reason="TURBULENCE")

        return actions
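    # Worked example of get_transactions (illustrative numbers): with hmax=100
    # and an action of 0.5 on a stock closing at $20, the model asks to trade
    # $50 of stock, i.e. 2.5 shares in continuous mode. With
    # discrete_actions=True that floors to 2 shares, and with
    # shares_increment=5 it rounds toward zero to 0 shares (no trade).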
    def step(self, actions):
        # track the cumulative magnitude of the model's raw actions
        self.sum_trades += np.sum(np.abs(actions))
        self.log_header()
        # print if it's time
        if (self.current_step + 1) % self.print_verbosity == 0:
            self.log_step(reason="update")
        # if we're at the end
        if self.date_index == len(self.dates) - 1:
            # if we hit the end, set reward to total gains (or losses)
            return self.return_terminal(reward=self.get_reward())
        else:
            """
            First, we compute the value of our holdings, save it, and log everything.
            Then we can reward the model for its earnings.
            """
            # compute value of cash + assets
            begin_cash = self.cash_on_hand
            assert min(self.holdings) >= 0
            asset_value = np.dot(self.holdings, self.closings)
            # log the values of cash, assets, and total assets
            self.account_information["cash"].append(begin_cash)
            self.account_information["asset_value"].append(asset_value)
            self.account_information["total_assets"].append(begin_cash + asset_value)

            # compute reward once we've computed the value of things
            reward = self.get_reward()
            self.account_information["reward"].append(reward)

            # now, let's get down to the business at hand
            transactions = self.get_transactions(actions)

            # compute our proceeds from sells, and add to cash
            sells = -np.clip(transactions, -np.inf, 0)
            proceeds = np.dot(sells, self.closings)
            costs = proceeds * self.sell_cost_pct
            coh = begin_cash + proceeds
            # compute the cost of our buys
            buys = np.clip(transactions, 0, np.inf)
            spend = np.dot(buys, self.closings)
            costs += spend * self.buy_cost_pct
            # if we run out of cash...
            if (spend + costs) > coh:
                if self.patient:
                    # ... just don't buy anything until we get additional cash
                    self.log_step(reason="CASH SHORTAGE")
                    transactions = np.where(transactions > 0, 0, transactions)
                    spend = 0
                    costs = 0
                else:
                    # ... end the episode and penalize
                    return self.return_terminal(
                        reason="CASH SHORTAGE", reward=self.get_reward()
                    )
            self.transaction_memory.append(
                transactions
            )  # capture the model's actual transactions
            # verify we didn't do anything impossible here
            assert (spend + costs) <= coh
            # update our holdings
            coh = coh - spend - costs
            holdings_updated = self.holdings + transactions
            self.date_index += 1
            if self.turbulence_threshold is not None:
                self.turbulence = self.get_date_vector(
                    self.date_index, cols=["turbulence"]
                )[0]
            # update state
            state = (
                [coh] + list(holdings_updated) + self.get_date_vector(self.date_index)
            )
            self.state_memory.append(state)
            return state, reward, False, {}
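    # The two helpers below wrap deep copies of this environment in
    # Stable-Baselines3 vectorized wrappers. A minimal, hypothetical usage:
    #     from stable_baselines3 import PPO
    #     env, obs = env_instance.get_sb_env()
    #     model = PPO("MlpPolicy", env)  # any SB3 agent works here
    #     model.learn(total_timesteps=10_000)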
    def get_sb_env(self):
        def get_self():
            return deepcopy(self)

        e = DummyVecEnv([get_self])
        obs = e.reset()
        return e, obs

    def get_multiproc_env(self, n=10):
        def get_self():
            return deepcopy(self)

        e = SubprocVecEnv([get_self for _ in range(n)], start_method="fork")
        obs = e.reset()
        return e, obs

    def save_asset_memory(self):
        if self.current_step == 0:
            return None
        else:
            self.account_information["date"] = self.dates[
                -len(self.account_information["cash"]) :
            ]
            return pd.DataFrame(self.account_information)

    def save_action_memory(self):
        if self.current_step == 0:
            return None
        else:
            return pd.DataFrame(
                {
                    "date": self.dates[-len(self.account_information["cash"]) :],
                    "actions": self.actions_memory,
                    "transactions": self.transaction_memory,
                }
            )
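# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only; not part of the environment). The
# toy DataFrame below is fabricated -- real usage passes a FinRL-preprocessed
# dataframe with one row per (date, tic) pair.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    dates = [str(d.date()) for d in pd.date_range("2021-01-01", periods=30)]
    n_rows = 2 * len(dates)
    toy_df = pd.DataFrame(
        {
            "date": np.repeat(dates, 2),
            "tic": ["AAA", "BBB"] * len(dates),
            "open": rng.uniform(9, 11, n_rows),
            "close": rng.uniform(9, 11, n_rows),
            "high": rng.uniform(11, 12, n_rows),
            "low": rng.uniform(8, 9, n_rows),
            "volume": rng.uniform(1e5, 2e5, n_rows),
        }
    )
    env = StockTradingEnvCashpenalty(
        df=toy_df, initial_amount=1e4, random_start=False, print_verbosity=5
    )
    obs = env.reset()
    for _ in range(10):
        # sample a random action per asset as a smoke test of the step loop
        obs, reward, done, info = env.step(env.action_space.sample())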