CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Sign Up | Sign In
huggingface

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.

GitHub Repository: huggingface/notebooks
Path: blob/main/course/fr/chapter3/section2_tf.ipynb
Views: 2548
Kernel: Python 3

Préparer des données (TensorFlow)

Installez les bibliothèques 🤗 Transformers et 🤗 Datasets pour exécuter ce notebook.

# Install the Datasets and Transformers libraries used by this notebook (IPython shell escape).
!pip install datasets transformers[sentencepiece]
import tensorflow as tf
import numpy as np
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

# Same as before: load the checkpoint's tokenizer and classification model,
# then encode two example sentences into one padded TensorFlow batch.
checkpoint = "camembert-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "J'ai attendu un cours d'HuggingFace toute ma vie.",
    "Je déteste tellement ça !",
]
batch = dict(tokenizer(sequences, padding=True, truncation=True, return_tensors="tf"))

# This is new: compile the model and run one training step on the batch.
# Transformers sequence-classification models return raw logits rather than
# softmax probabilities, so the loss must be told to apply the softmax itself;
# the plain string "sparse_categorical_crossentropy" assumes probabilities.
model.compile(
    optimizer="adam",
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)
labels = tf.convert_to_tensor([1, 1])
model.train_on_batch(batch, labels)
from datasets import load_dataset

# Load the "fr" configuration of the "paws-x" dataset.
raw_datasets = load_dataset(path="paws-x", name="fr")

# Last expression of the cell: display what was loaded.
raw_datasets
# Keep a handle on the training split, then display its first example.
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[0]
# Display the `features` attribute of the training split.
raw_train_dataset.features
from transformers import AutoTokenizer

checkpoint = "camembert-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Tokenize each sentence column of the training split on its own
# ("sentence1" first, then "sentence2").
tokenized_sentences_1, tokenized_sentences_2 = (
    tokenizer(raw_datasets["train"][column])
    for column in ("sentence1", "sentence2")
)
# Encode two sentences together as a single pair, then display the encoding.
inputs = tokenizer(
    "C'est la première phrase.",
    text_pair="C'est la deuxième.",
)
inputs
# Convert the ids stored under "input_ids" back into tokens.
tokenizer.convert_ids_to_tokens(inputs["input_ids"])
# Tokenize every (sentence1, sentence2) pair of the training split in one call,
# with padding and truncation enabled.
tokenized_dataset = tokenizer(
    text=raw_datasets["train"]["sentence1"],
    text_pair=raw_datasets["train"]["sentence2"],
    truncation=True,
    padding=True,
)
def tokenize_function(example):
    """Tokenize the "sentence1"/"sentence2" pair held in *example*.

    Args:
        example: Mapping that provides "sentence1" and "sentence2" entries.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the pair, with
        truncation enabled. No padding is requested here.
    """
    first, second = example["sentence1"], example["sentence2"]
    return tokenizer(first, second, truncation=True)
# Apply tokenize_function across the dataset with batched=True,
# then display the result.
tokenized_datasets = raw_datasets.map(function=tokenize_function, batched=True)
tokenized_datasets
from transformers import DataCollatorWithPadding

# Collator built around the tokenizer, configured to return TensorFlow tensors.
data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
# Columns that must not reach the collator. The row identifier is excluded
# under both spellings: PAWS-X appears to name it "id" (verify against
# raw_train_dataset.features), while "idx" is kept for datasets that use it.
excluded_columns = {"id", "idx", "sentence1", "sentence2"}

# Take the first 8 training examples and drop the excluded columns.
samples = tokenized_datasets["train"][:8]
samples = {k: v for k, v in samples.items() if k not in excluded_columns}

# Display the length of each example's "input_ids" entry.
[len(x) for x in samples["input_ids"]]
# Run the samples through the collator, then display the shape of each entry.
batch = data_collator(samples)
{name: tensor.shape for name, tensor in batch.items()}
# Request only the model-input columns that tokenization actually produced.
# Tokenizers for RoBERTa-style checkpoints such as camembert-base typically do
# not emit "token_type_ids", and asking to_tf_dataset for a missing column
# fails; BERT-style checkpoints that do emit it still get it.
feature_columns = [
    name
    for name in ("attention_mask", "input_ids", "token_type_ids")
    if name in tokenized_datasets["train"].column_names
]

# Training split: shuffled, batches of 8 built by the data collator.
tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=feature_columns,
    label_cols=["labels"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)

# Validation split: same settings, but not shuffled.
tf_validation_dataset = tokenized_datasets["validation"].to_tf_dataset(
    columns=feature_columns,
    label_cols=["labels"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=8,
)