
GitHub Repository: huggingface/notebooks
Path: blob/main/course/fr/chapter3/section3_tf.ipynb
Kernel: Python 3

Fine-tuning a model with Keras

Install the 🤗 Transformers and 🤗 Datasets libraries to run this notebook.

!pip install datasets transformers[sentencepiece]
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
import numpy as np

raw_datasets = load_dataset("paws-x", "fr")
checkpoint = "camembert-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)

tf_validation_dataset = tokenized_datasets["validation"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=8,
)
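As a quick sanity check, you can pull a single batch from tf_train_dataset and look at the tensor shapes. This is a minimal sketch, assuming the cell above has run; the padded sequence length will vary from batch to batch.

# Inspect one batch: to_tf_dataset() yields (features_dict, labels) pairs.
batch, labels = next(iter(tf_train_dataset))
for name, tensor in batch.items():
    print(name, tensor.shape)
print("labels", labels.shape)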
from transformers import TFAutoModelForSequenceClassification

model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
from tensorflow.keras.losses import SparseCategoricalCrossentropy

model.compile(
    optimizer="adam",
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
model.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
)
from tensorflow.keras.optimizers.schedules import PolynomialDecay

batch_size = 8
num_epochs = 3
# The number of training steps is the number of samples in the dataset,
# divided by the batch size, then multiplied by the total number of epochs.
# Note that tf_train_dataset here is a batched tf.data.Dataset,
# not the original Hugging Face dataset, so its len() is already
# num_samples // batch_size.
num_train_steps = len(tf_train_dataset) * num_epochs
lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5, end_learning_rate=0.0, decay_steps=num_train_steps
)

from tensorflow.keras.optimizers import Adam

opt = Adam(learning_rate=lr_scheduler)
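You can cross-check the step count in the comment above by deriving it from the raw dataset instead. This is a sketch assuming the earlier cells have run; the two values may differ slightly depending on whether the final partial batch is kept.

import math

# Steps per epoch computed from the raw (unbatched) dataset size.
steps_from_raw = math.ceil(len(tokenized_datasets["train"]) / batch_size) * num_epochs
print(num_train_steps, steps_from_raw)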
import tensorflow as tf

model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=opt, loss=loss, metrics=["accuracy"])
model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3)
preds = model.predict(tf_validation_dataset)["logits"]
class_preds = np.argmax(preds, axis=1)
print(preds.shape, class_preds.shape)
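Since the model outputs raw logits, you can also turn them into class probabilities before taking the argmax. A small sketch using TensorFlow's softmax; it produces the same class_preds as above.

# Softmax over the class dimension converts logits into probabilities.
probs = tf.nn.softmax(preds, axis=-1).numpy()
print(probs[:3])  # probabilities for the first three validation examples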
from datasets import load_metric

metric = load_metric("glue", "mrpc")
metric.compute(predictions=class_preds, references=raw_datasets["validation"]["label"])
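In recent versions of 🤗 Datasets, load_metric is deprecated in favour of the separate 🤗 Evaluate library. The equivalent call, assuming evaluate is installed (pip install evaluate), would be:

import evaluate

# Same GLUE/MRPC metric (accuracy and F1), loaded through 🤗 Evaluate.
metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=class_preds, references=raw_datasets["validation"]["label"])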