CoCalc -- split.py

GitHub Repository: ethen8181/machine-learning
Path: blob/master/deep_learning/multi_label/fasttext_module/split.py
¹⁴⁸⁷ views
1
import random
2
from typing import Tuple
3

4

5
# Public API: the only name exported by a star-import of this module.
__all__ = ['train_test_split_file']
6

7

8
def train_test_split_file(input_path: str,
                          output_path_train: str,
                          output_path_test: str,
                          test_size: float=0.1,
                          random_state: int=1234,
                          encoding: str='utf-8') -> Tuple[int, int]:
    """
    Perform train and test split on a text file without reading the
    whole file into memory.

    The input is streamed line by line, and each line is independently
    routed to the test file with probability ``test_size`` (otherwise to
    the train file). The resulting split sizes are therefore approximate,
    not exact fractions of the input.

    Parameters
    ----------
    input_path : str
        Path to the original full text file.

    output_path_train : str
        Path of the train split. The file is opened in write mode, so any
        existing content is overwritten.

    output_path_test : str
        Path of the test split. The file is opened in write mode, so any
        existing content is overwritten.

    test_size : float, 0.0 ~ 1.0, default 0.1
        Probability of a line being assigned to the test split. A value
        <= 0 sends every line to the train split, a value >= 1 sends every
        line to the test split.

    random_state : int, default 1234
        Seed for the random split. The same seed and input always produce
        the same split.

    encoding : str, default 'utf-8'
        Encoding for reading and writing the file.

    Returns
    -------
    count_train, count_test : int
        Number of records in the training and test set.
    """
    # Use a dedicated generator instead of ``random.seed`` so that calling
    # this function does not reset the process-wide random state that other
    # code may rely on. For a given seed it yields the same sequence as the
    # module-level functions would.
    rng = random.Random(random_state)

    # accumulate the number of records in the training and test set
    count_train = 0
    count_test = 0

    # draws fall in [0, 1); anything below this threshold goes to train
    train_range = 1 - test_size

    with open(input_path, encoding=encoding) as f_in, \
         open(output_path_train, 'w', encoding=encoding) as f_train, \
         open(output_path_test, 'w', encoding=encoding) as f_test:

        for line in f_in:
            if rng.random() < train_range:
                f_train.write(line)
                count_train += 1
            else:
                f_test.write(line)
                count_test += 1

    return count_train, count_test
64

65
Product

Resources

Company