Fine-tuning a masked language model (TensorFlow)
Install the 🤗 Datasets and 🤗 Transformers libraries to run this notebook.
In [ ]:
!pip install datasets transformers[sentencepiece]
!apt install git-lfs
You will need to configure git; adapt your email and name in the following cell.
In [ ]:
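!git config --global user.email "you@example.com"  # replace with your email
!git config --global user.name "Your Name"  # replace with your name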
You will also need to be logged in to the Hugging Face Hub. Run the following and enter your credentials.
In [ ]:
from huggingface_hub import notebook_login

notebook_login()
In [ ]:
from transformers import TFAutoModelForMaskedLM

model_checkpoint = "camembert-base"
model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint)
In [ ]:
model.summary()
In [ ]:
text = "C'est une grande <mask>."
In [ ]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
In [ ]:
inputs = tokenizer(text, return_tensors="np")
inputs
In [ ]:
import numpy as np
import tensorflow as tf

inputs = tokenizer(text, return_tensors="np")
token_logits = model(**inputs).logits
# Find the location of the [MASK] token and extract its logits
mask_token_index = np.argwhere(inputs["input_ids"] == tokenizer.mask_token_id)[0, 1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the <mask> candidates with the highest logits
# We negate the array before argsort to get the largest, not the smallest, logits
top_5_tokens = np.argsort(-mask_token_logits)[:5].tolist()

for token in top_5_tokens:
    print(f">>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}")
In [ ]:
from datasets import load_dataset

imdb_dataset = load_dataset("allocine")
imdb_dataset
In [ ]:
sample = imdb_dataset["train"].shuffle(seed=42).select(range(3))

for row in sample:
    print(f"\n'>>> Review: {row['review']}'")
    print(f"'>>> Label: {row['label']}'")
In [ ]:
def tokenize_function(examples):
    result = tokenizer(examples["review"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# Use batched=True to activate fast multithreading!
tokenized_datasets = imdb_dataset.map(
    tokenize_function, batched=True, remove_columns=["review", "label"]
)
tokenized_datasets
In [ ]:
tokenizer.model_max_length
In [ ]:
chunk_size = 128
In [ ]:
# Slicing produces a list of lists for each feature
tokenized_samples = tokenized_datasets["train"][:3]

for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Review {idx} length: {len(sample)}'")
In [ ]:
concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")
In [ ]:
chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")
In [ ]:
def group_texts(examples):
    # Concatenate all the texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute the length of the concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Drop the last chunk if it is smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split into chunks of chunk_size
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result
In [ ]:
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets
In [ ]:
tokenizer.decode(lm_datasets["train"][1]["input_ids"])
In [ ]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
In [ ]:
samples = [lm_datasets["train"][i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")
In [ ]:
import collections

import numpy as np
from transformers.data.data_collator import tf_default_data_collator

wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and the indices of their corresponding tokens
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["labels"] = new_labels

    return tf_default_data_collator(features)
In [ ]:
samples = [lm_datasets["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")
In [ ]:
train_size = 10_000
test_size = int(0.1 * train_size)

downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset
In [ ]:
from huggingface_hub import notebook_login

notebook_login()
In [ ]:
tf_train_dataset = model.prepare_tf_dataset(
    downsampled_dataset["train"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=32,
)

tf_eval_dataset = model.prepare_tf_dataset(
    downsampled_dataset["test"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=32,
)
In [ ]:
from transformers import create_optimizer
from transformers.keras_callbacks import PushToHubCallback
import tensorflow as tf

num_train_steps = len(tf_train_dataset)
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=1_000,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

# Train in mixed-precision float16
tf.keras.mixed_precision.set_global_policy("mixed_float16")

# Derive the model name from the checkpoint to build the output repository name
model_name = model_checkpoint.split("/")[-1]
callback = PushToHubCallback(
    output_dir=f"{model_name}-finetuned-allocine", tokenizer=tokenizer
)
In [ ]:
import math

eval_loss = model.evaluate(tf_eval_dataset)
print(f"Perplexity: {math.exp(eval_loss):.2f}")
In [ ]:
model.fit(tf_train_dataset, validation_data=tf_eval_dataset, callbacks=[callback])
In [ ]:
eval_loss = model.evaluate(tf_eval_dataset)
print(f"Perplexity: {math.exp(eval_loss):.2f}")
In [ ]:
from transformers import pipeline

mask_filler = pipeline(
    "fill-mask", model="camembert-base-finetuned-allocine", framework="tf"
)
In [ ]:
preds = mask_filler(text)

for pred in preds:
    print(f">>> {pred['sequence']}")