Path: blob/master/deep_learning/tabular/bert_ctr/kdd2012_track2_preprocess_data.py
"""1preprocess kdd 2012 track 2 dataset, and create train/test split23Prerequisite, download and unzip track2.zip from4https://www.kaggle.com/competitions/kddcup2012-track2/overview5"""6import numpy as np7import pandas as pd8from sklearn.model_selection import train_test_split9from sklearn.feature_extraction.text import TfidfVectorizer10from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler111213def main():14names = [15"click",16"impression",17"display_url",18"ad_id",19"advertiser_id",20"depth",21"position",22"query_id",23"keyword_id",24"title_id",25"description_id",26"user_id"27]2829df = pd.read_csv(30"track2/training.txt",31sep="\t",32header=None,33names=names,34# work with sub-sample of the raw data35nrows=100000036)37df["label"] = np.where(df["click"] > 0, 1, 0)38print(df.shape)39print(df.head())4041df_title, df_query = preprocess_df_entity_token_id()42df_user = pd.read_csv(43"track2/userid_profile.txt",44sep="\t",45header=None,46names=["user_id", "gender", "age"]47)4849df_enriched = (50df51.merge(df_title, on="title_id")52.merge(df_query, on="query_id")53.merge(df_user, on="user_id", how="left")54)55df_final = preprocess_tabular_features(df_enriched)56print(df_final.shape)57print(df_final.head())58print(df_final["label"].value_counts())59df_train, df_test = train_test_split(df_final, test_size=0.1, random_state=1234, stratify=df["label"])6061df_train.to_parquet("track2_processed_train.parquet", index=False)62df_test.to_parquet("track2_processed_test.parquet", index=False)636465def preprocess_tabular_features(df: pd.DataFrame):66"""Ordinal encoder categorical features as well as scale numerical features"""67numerical_features = ["depth"]68categorical_features = ["gender", "age", "advertiser_id", "user_id"]6970df[numerical_features] = df[numerical_features].fillna(0)71df[categorical_features] = df[categorical_features].fillna(-1)7273ordinal_encoder = OrdinalEncoder(min_frequency=30)74df[categorical_features] = ordinal_encoder.fit_transform(df[categorical_features])7576min_max_scaler = MinMaxScaler(feature_range=(0, 1))77df[numerical_features] = min_max_scaler.fit_transform(df[numerical_features])78return df798081def preprocess_df_entity_token_id():82"""create vocabulary id from raw token id"""83df_title = pd.read_csv(84"track2/titleid_tokensid.txt",85sep="\t",86header=None,87names=["entity_id", "tokens_id"]88)89df_title["entity"] = "title"9091df_query = pd.read_csv(92"track2/queryid_tokensid.txt",93sep="\t",94header=None,95names=["entity_id", "tokens_id"]96)97df_query["entity"] = "query"9899df_entity_token_id = pd.concat([df_title, df_query], axis=0).reset_index(drop=True)100print(df_entity_token_id.shape)101print(df_entity_token_id.head())102103# use tf-idf to create the token to vocabulary mapping104# The default regexp selects tokens of 2 or more alphanumeric characters105# (punctuation is completely ignored and always treated as a token separator),106# here we modified it to 1, else single digit tokens would get skipped,107# we'll need to provide additional filtering arguments like min and max df, else108# the word level vocabulary would become extremely big109token_pattern = r"(?u)\b\w+\b"110tfidf_vect = TfidfVectorizer(token_pattern=token_pattern, min_df=10, max_df=0.5)111tfidf_vect.fit(df_entity_token_id["tokens_id"])112# original vocab size 1049677113vocab_size = len(tfidf_vect.vocabulary_)114print("vocab size: ", vocab_size)115116vocab_id = []117for token_id in df_entity_token_id["tokens_id"]:118vocab = [tfidf_vect.vocabulary_.get(token, vocab_size) for token in 
token_id.split("|")]119vocab_id.append(vocab)120121df_entity_token_id["vocab_id"] = vocab_id122123df_title = df_entity_token_id.loc[df_entity_token_id["entity"] == "title", ["entity_id", "vocab_id"]]124df_title = df_title.rename(columns={"entity_id": "title_id", "vocab_id": "tokenized_title"})125df_query = df_entity_token_id.loc[df_entity_token_id["entity"] == "query", ["entity_id", "vocab_id"]]126df_query = df_query.rename(columns={"entity_id": "query_id", "vocab_id": "tokenized_query"})127return df_title, df_query128129130if __name__ == "__main__":131main()132133
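
# A minimal usage sketch (not part of the original script): the generated
# parquet splits can be loaded back for downstream model training, e.g.
#
#     import pandas as pd
#
#     df_train = pd.read_parquet("track2_processed_train.parquet")
#     df_test = pd.read_parquet("track2_processed_test.parquet")
#     # tokenized_title / tokenized_query hold variable length lists of vocab
#     # ids, the remaining columns are the encoded tabular features and label
#     print(df_train.shape, df_test.shape)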