Path: blob/master/deep_learning/tabular/bert_ctr/kdd2012_track2_preprocess_data.py
"""1preprocess kdd 2012 track 2 dataset, and create train/test split23Prerequisite, download and unzip track2.zip from4https://www.kaggle.com/competitions/kddcup2012-track2/overview5"""6import numpy as np7import pandas as pd8from sklearn.model_selection import train_test_split9from sklearn.feature_extraction.text import TfidfVectorizer10from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler111213def main():14names = [15"click",16"impression",17"display_url",18"ad_id",19"advertiser_id",20"depth",21"position",22"query_id",23"keyword_id",24"title_id",25"description_id",26"user_id"27]2829df = pd.read_csv(30"track2/training.txt",31sep="\t",32header=None,33names=names,34# work with sub-sample of the raw data35nrows=100000036)37df["label"] = np.where(df["click"] > 0, 1, 0)38print(df.shape)39print(df.head())4041df_title, df_query = preprocess_df_entity_token_id()42df_user = pd.read_csv(43"track2/userid_profile.txt",44sep="\t",45header=None,46names=["user_id", "gender", "age"]47)4849df_enriched = (50df51.merge(df_title, on="title_id")52.merge(df_query, on="query_id")53.merge(df_user, on="user_id", how="left")54)55df_final = preprocess_tabular_features(df_enriched)56print(df_final.shape)57print(df_final.head())58print(df_final["label"].value_counts())59df_train, df_test = train_test_split(df_final, test_size=0.1, random_state=1234, stratify=df["label"])6061df_train.to_parquet("track2_processed_train.parquet", index=False)62df_test.to_parquet("track2_processed_test.parquet", index=False)636465def preprocess_tabular_features(df: pd.DataFrame):66"""Ordinal encoder categorical features as well as scale numerical features"""67numerical_features = ["depth"]68categorical_features = ["gender", "age", "advertiser_id", "user_id"]6970df[numerical_features] = df[numerical_features].fillna(0)71df[categorical_features] = df[categorical_features].fillna(-1)7273ordinal_encoder = OrdinalEncoder(min_frequency=30)74df[categorical_features] = ordinal_encoder.fit_transform(df[categorical_features])7576min_max_scaler = MinMaxScaler(feature_range=(0, 1))77df[numerical_features] = min_max_scaler.fit_transform(df[numerical_features])78return df798081def preprocess_df_entity_token_id():82"""create vocabulary id from raw token id"""83df_title = pd.read_csv(84"track2/titleid_tokensid.txt",85sep="\t",86header=None,87names=["entity_id", "tokens_id"]88)89df_title["entity"] = "title"9091df_query = pd.read_csv(92"track2/queryid_tokensid.txt",93sep="\t",94header=None,95names=["entity_id", "tokens_id"]96)97df_query["entity"] = "query"9899df_entity_token_id = pd.concat([df_title, df_query], axis=0).reset_index(drop=True)100print(df_entity_token_id.shape)101print(df_entity_token_id.head())102103# use tf-idf to create the token to vocabulary mapping104# The default regexp selects tokens of 2 or more alphanumeric characters105# (punctuation is completely ignored and always treated as a token separator),106# here we modified it to 1, else single digit tokens would get skipped,107# we'll need to provide additional filtering arguments like min and max df, else108# the word level vocabulary would become extremely big109token_pattern = r"(?u)\b\w+\b"110tfidf_vect = TfidfVectorizer(token_pattern=token_pattern, min_df=10, max_df=0.5)111tfidf_vect.fit(df_entity_token_id["tokens_id"])112# original vocab size 1049677113vocab_size = len(tfidf_vect.vocabulary_)114print("vocab size: ", vocab_size)115116vocab_id = []117for token_id in df_entity_token_id["tokens_id"]:118vocab = [tfidf_vect.vocabulary_.get(token, vocab_size) for token in 
token_id.split("|")]119vocab_id.append(vocab)120121df_entity_token_id["vocab_id"] = vocab_id122123df_title = df_entity_token_id.loc[df_entity_token_id["entity"] == "title", ["entity_id", "vocab_id"]]124df_title = df_title.rename(columns={"entity_id": "title_id", "vocab_id": "tokenized_title"})125df_query = df_entity_token_id.loc[df_entity_token_id["entity"] == "query", ["entity_id", "vocab_id"]]126df_query = df_query.rename(columns={"entity_id": "query_id", "vocab_id": "tokenized_query"})127return df_title, df_query128129130if __name__ == "__main__":131main()132133
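
# A minimal usage sketch (not part of the original script): the generated
# parquet splits can be loaded back for downstream model training, e.g.
#
#     import pandas as pd
#
#     df_train = pd.read_parquet("track2_processed_train.parquet")
#     df_test = pd.read_parquet("track2_processed_test.parquet")
#     # tokenized_title / tokenized_query hold variable length lists of vocab
#     # ids, the remaining columns are the encoded tabular features and label
#     print(df_train.shape, df_test.shape)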