GitHub Repository: ethen8181/machine-learning
Path: blob/master/deep_learning/tabular/bert_ctr/kdd2012_track2_preprocess_data.py
"""
Preprocess the KDD Cup 2012 Track 2 dataset and create a train/test split.

Prerequisite: download and unzip track2.zip from
https://www.kaggle.com/competitions/kddcup2012-track2/overview
"""
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler


def main():
    names = [
        "click",
        "impression",
        "display_url",
        "ad_id",
        "advertiser_id",
        "depth",
        "position",
        "query_id",
        "keyword_id",
        "title_id",
        "description_id",
        "user_id"
    ]

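    # training.txt has no header row: each record carries raw click/impression
    # counts plus the ids describing the ad, query, and user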
    df = pd.read_csv(
        "track2/training.txt",
        sep="\t",
        header=None,
        names=names,
        # work with a sub-sample of the raw data
        nrows=1000000
    )
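    # binarize the raw click count into a 0/1 click-through label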
df["label"] = np.where(df["click"] > 0, 1, 0)
39
print(df.shape)
40
print(df.head())
41
42
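    # attach tokenized title/query text along with user demographic profiles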
    df_title, df_query = preprocess_df_entity_token_id()
    df_user = pd.read_csv(
        "track2/userid_profile.txt",
        sep="\t",
        header=None,
        names=["user_id", "gender", "age"]
    )

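    # the title/query merges default to inner joins; user profiles are
    # left-joined since not every user_id has a profile entry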
    df_enriched = (
        df
        .merge(df_title, on="title_id")
        .merge(df_query, on="query_id")
        .merge(df_user, on="user_id", how="left")
    )
    df_final = preprocess_tabular_features(df_enriched)
    print(df_final.shape)
    print(df_final.head())
    print(df_final["label"].value_counts())
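    # stratify on df_final's label rather than df's: the merges above can drop
    # or reorder rows, so df["label"] no longer aligns with df_final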
    df_train, df_test = train_test_split(
        df_final, test_size=0.1, random_state=1234, stratify=df_final["label"]
    )

    df_train.to_parquet("track2_processed_train.parquet", index=False)
    df_test.to_parquet("track2_processed_test.parquet", index=False)


def preprocess_tabular_features(df: pd.DataFrame):
    """Ordinal-encode categorical features and min-max scale numerical features."""
    numerical_features = ["depth"]
    categorical_features = ["gender", "age", "advertiser_id", "user_id"]

    df[numerical_features] = df[numerical_features].fillna(0)
    df[categorical_features] = df[categorical_features].fillna(-1)

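    # group categories seen fewer than 30 times into a single infrequent
    # bucket, keeping the encoded cardinality of high-cardinality ids manageable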
    ordinal_encoder = OrdinalEncoder(min_frequency=30)
    df[categorical_features] = ordinal_encoder.fit_transform(df[categorical_features])

    min_max_scaler = MinMaxScaler(feature_range=(0, 1))
    df[numerical_features] = min_max_scaler.fit_transform(df[numerical_features])
    return df


def preprocess_df_entity_token_id():
    """Create vocabulary ids from the raw token ids of each title and query."""
    df_title = pd.read_csv(
        "track2/titleid_tokensid.txt",
        sep="\t",
        header=None,
        names=["entity_id", "tokens_id"]
    )
    df_title["entity"] = "title"

    df_query = pd.read_csv(
        "track2/queryid_tokensid.txt",
        sep="\t",
        header=None,
        names=["entity_id", "tokens_id"]
    )
    df_query["entity"] = "query"

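    # stack titles and queries so a single shared vocabulary is fit across both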
    df_entity_token_id = pd.concat([df_title, df_query], axis=0).reset_index(drop=True)
    print(df_entity_token_id.shape)
    print(df_entity_token_id.head())

    # use tf-idf to create the token to vocabulary mapping.
    # the default regexp selects tokens of 2 or more alphanumeric characters
    # (punctuation is completely ignored and always treated as a token separator);
    # here we relax the minimum to 1 character, else single-digit token ids would
    # get skipped. We also provide additional filtering arguments, min_df and
    # max_df, else the word-level vocabulary would become extremely big
    token_pattern = r"(?u)\b\w+\b"
    tfidf_vect = TfidfVectorizer(token_pattern=token_pattern, min_df=10, max_df=0.5)
    tfidf_vect.fit(df_entity_token_id["tokens_id"])
    # original (unfiltered) vocab size: 1049677
    vocab_size = len(tfidf_vect.vocabulary_)
    print("vocab size: ", vocab_size)

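    # map each raw token id to its vocabulary index; tokens filtered out by
    # min_df/max_df fall back to vocab_size, which serves as a shared
    # out-of-vocabulary id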
    vocab_id = []
    for token_id in df_entity_token_id["tokens_id"]:
        vocab = [tfidf_vect.vocabulary_.get(token, vocab_size) for token in token_id.split("|")]
        vocab_id.append(vocab)

df_entity_token_id["vocab_id"] = vocab_id
123
124
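    # split the combined frame back into per-entity lookup tables keyed by
    # title_id / query_id, matching the merge keys used in main()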
    df_title = df_entity_token_id.loc[df_entity_token_id["entity"] == "title", ["entity_id", "vocab_id"]]
    df_title = df_title.rename(columns={"entity_id": "title_id", "vocab_id": "tokenized_title"})
    df_query = df_entity_token_id.loc[df_entity_token_id["entity"] == "query", ["entity_id", "vocab_id"]]
    df_query = df_query.rename(columns={"entity_id": "query_id", "vocab_id": "tokenized_query"})
    return df_title, df_query


if __name__ == "__main__":
    main()