
GitHub Repository: huggingface/notebooks
Path: blob/main/course/fr/chapter3/section2_pt.ipynb
Kernel: Python 3

Preparing the data (PyTorch)

Install the 🤗 Transformers and 🤗 Datasets libraries to run this notebook.

!pip install datasets transformers[sentencepiece]
import torch
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

# Same as before
checkpoint = "camembert-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "J'ai attendu un cours d'HuggingFace toute ma vie.",
    "Je déteste tellement ça !",
]
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# This is new: attach the labels so the model can compute a loss
batch["labels"] = torch.tensor([1, 1])

optimizer = AdamW(model.parameters())
loss = model(**batch).loss
loss.backward()
optimizer.step()
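Note that recent releases of 🤗 Transformers deprecate the bundled AdamW optimizer. If the import above emits a deprecation warning or fails in your environment, PyTorch's own implementation can be swapped in; a minimal sketch:

from torch.optim import AdamW  # PyTorch's AdamW, a drop-in replacement here

optimizer = AdamW(model.parameters())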
from datasets import load_dataset

raw_datasets = load_dataset("paws-x", "fr")
raw_datasets
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[0]
raw_train_dataset.features
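The features attribute shows how each column is typed, including the mapping between integer labels and their names. If you want to check that mapping yourself, here is a minimal sketch (assuming the label column is a ClassLabel, as the output above should confirm):

label_feature = raw_train_dataset.features["label"]
# The order of names gives the int -> name mapping used by the dataset
print(label_feature.names)
# Convert the first example's integer label back to its name
print(label_feature.int2str(raw_train_dataset[0]["label"]))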
from transformers import AutoTokenizer

checkpoint = "camembert-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])
inputs = tokenizer("C'est la première phrase.", "C'est la deuxième.")
inputs
tokenizer.convert_ids_to_tokens(inputs["input_ids"])
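If the raw token list is hard to read, you can also decode the ids back into a string to see where the tokenizer placed its special tokens around the sentence pair. A minimal sketch using the standard decode method:

# Decoding keeps the special tokens visible so you can see how the pair is framed
print(tokenizer.decode(inputs["input_ids"]))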
# Pads the whole dataset at once and returns plain Python lists,
# which requires keeping everything in memory
tokenized_dataset = tokenizer(
    raw_datasets["train"]["sentence1"],
    raw_datasets["train"]["sentence2"],
    padding=True,
    truncation=True,
)
def tokenize_function(example):
    # No padding here: padding is applied per batch later by the data collator
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets
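map adds the tokenizer's outputs as new columns while keeping the original ones, so comparing column names before and after is a quick sanity check. A minimal sketch (the exact new columns depend on the tokenizer):

# The tokenized splits keep the raw columns and gain the tokenizer's outputs
print(raw_datasets["train"].column_names)
print(tokenized_datasets["train"].column_names)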
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
samples = tokenized_datasets["train"][:8]
# Drop the columns the collator cannot turn into tensors (strings) and the example ids
samples = {k: v for k, v in samples.items() if k not in ["id", "idx", "sentence1", "sentence2"]}
[len(x) for x in samples["input_ids"]]
batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}
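Dynamic padding means the batch was padded to the longest of the eight samples rather than to a dataset-wide maximum. A minimal check, assuming samples still holds the per-example lengths inspected above:

# The second dimension of each tensor should match the longest sample in the batch
longest = max(len(x) for x in samples["input_ids"])
print(batch["input_ids"].shape[1] == longest)  # expected: True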