Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
ethen8181
GitHub Repository: ethen8181/machine-learning
Path: blob/master/deep_learning/multi_label/fasttext_module/split.py
1487 views
1
import random
2
from typing import Tuple
3
4
5
# Public API of this module: the only name exported by a star-import.
__all__ = ['train_test_split_file']
6
7
8
def train_test_split_file(input_path: str,
                          output_path_train: str,
                          output_path_test: str,
                          test_size: float = 0.1,
                          random_state: int = 1234,
                          encoding: str = 'utf-8') -> Tuple[int, int]:
    """
    Perform train and test split on a text file without reading the
    whole file into memory.

    The input is streamed line by line; each line is independently routed
    to the test file with probability ``test_size`` and to the train file
    otherwise. Because the assignment is random per line, the realized
    test proportion is only approximately ``test_size``.

    Parameters
    ----------
    input_path : str
        Path to the original full text file.

    output_path_train : str
        Path of the train split. The file is created or overwritten.

    output_path_test : str
        Path of the test split. The file is created or overwritten.

    test_size : float, 0.0 ~ 1.0, default 0.1
        Probability of a line being assigned to the test split.

    random_state : int, default 1234
        Seed for the random split. The same seed and input produce the
        same split.

    encoding : str, default 'utf-8'
        Encoding for reading and writing the file.

    Returns
    -------
    count_train, count_test : int
        Number of records in the training and test set.

    Raises
    ------
    ValueError
        If ``test_size`` is not within the range [0.0, 1.0].
    """
    # validate before opening anything, as opening the outputs truncates them
    if not 0.0 <= test_size <= 1.0:
        raise ValueError(
            'test_size must be between 0.0 and 1.0, got {!r}'.format(test_size))

    # a private generator yields the same sequence as seeding the module-level
    # one, but leaves the caller's global random state untouched
    rng = random.Random(random_state)

    # accumulate the number of records in the training and test set
    count_train = 0
    count_test = 0
    train_range = 1 - test_size

    with open(input_path, encoding=encoding) as f_in, \
            open(output_path_train, 'w', encoding=encoding) as f_train, \
            open(output_path_test, 'w', encoding=encoding) as f_test:

        for line in f_in:
            if rng.random() < train_range:
                f_train.write(line)
                count_train += 1
            else:
                f_test.write(line)
                count_test += 1

    return count_train, count_test
64
65