
GitHub Repository: huggingface/notebooks
Path: blob/main/course/videos/dynamic_padding.ipynb

This notebook contains the code samples from the video below, which is part of the Hugging Face course.

#@title
from IPython.display import HTML

HTML('<iframe width="560" height="315" src="https://www.youtube.com/embed/7q5NyFT8REg?rel=0&amp;controls=0&amp;showinfo=0" frameborder="0" allowfullscreen></iframe>')

Install the Transformers and Datasets libraries to run this notebook.

! pip install datasets transformers[sentencepiece]
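As a quick sanity check before running the rest of the notebook (this cell is not part of the original), you can print the installed versions:

import datasets
import transformers

print("datasets:", datasets.__version__)
print("transformers:", transformers.__version__)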
from datasets import load_dataset
from transformers import AutoTokenizer

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(examples):
    # Pad every sentence pair to a fixed length of 128 tokens
    return tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["idx", "sentence1", "sentence2"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets = tokenized_datasets.with_format("torch")
Reusing dataset glue (/home/sgugger/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Loading cached processed dataset at /home/sgugger/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-2b2682faffe74c3f.arrow
Loading cached processed dataset at /home/sgugger/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-78d79fc323f0156c.arrow
Loading cached processed dataset at /home/sgugger/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-801914374fb3c6ca.arrow
from torch.utils.data import DataLoader

train_dataloader = DataLoader(tokenized_datasets["train"], batch_size=16, shuffle=True)
for step, batch in enumerate(train_dataloader):
    print(batch["input_ids"].shape)
    if step > 5:
        break
torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])
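Every batch comes out as 16 × 128, even though most MRPC sentence pairs are much shorter than 128 tokens. As a rough illustration (this check is not in the original notebook), you can measure how much of a batch is padding by looking at the attention mask, where padded positions are 0:

# Illustrative check: fraction of padding tokens in one fixed-length batch
batch = next(iter(train_dataloader))
mask = batch["attention_mask"]
pad_fraction = 1.0 - mask.float().mean().item()
print(f"{pad_fraction:.0%} of this batch is padding")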
from datasets import load_dataset
from transformers import AutoTokenizer

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(examples):
    # No padding here: each example keeps its own length,
    # and padding is deferred to batching time
    return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["idx", "sentence1", "sentence2"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets = tokenized_datasets.with_format("torch")
Reusing dataset glue (/home/sgugger/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Loading cached processed dataset at /home/sgugger/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-8174fd92eed0af98.arrow
Loading cached processed dataset at /home/sgugger/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-8c99fb059544bc96.arrow
Loading cached processed dataset at /home/sgugger/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-e625eb72bcf1ae1f.arrow
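Since no padding was applied at tokenization time, each example keeps its own length. A quick look at the spread (an illustrative check, not in the original notebook) shows why padding the whole dataset to one fixed length is wasteful:

# Illustrative check: sequence lengths now vary from example to example
lengths = [len(ids) for ids in tokenized_datasets["train"].with_format(None)["input_ids"]]
print("min:", min(lengths), "max:", max(lengths))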
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

# The collator pads each batch to the length of its longest sequence
data_collator = DataCollatorWithPadding(tokenizer)
train_dataloader = DataLoader(
    tokenized_datasets["train"], batch_size=16, shuffle=True, collate_fn=data_collator
)
for step, batch in enumerate(train_dataloader):
    print(batch["input_ids"].shape)
    if step > 5:
        break
torch.Size([16, 83])
torch.Size([16, 75])
torch.Size([16, 81])
torch.Size([16, 75])
torch.Size([16, 80])
torch.Size([16, 81])
torch.Size([16, 81])
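Each batch is now padded only to the length of its longest sequence, so the model does far less work on padding tokens. A related option worth knowing: DataCollatorWithPadding also accepts a pad_to_multiple_of argument, which rounds each batch length up to a multiple of a given value; this can help on accelerators that prefer fixed or rounded tensor shapes. A minimal sketch:

# Pad each batch up to a multiple of 8 instead of the exact maximum length
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
train_dataloader = DataLoader(
    tokenized_datasets["train"], batch_size=16, shuffle=True, collate_fn=data_collator
)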