CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutSign UpSign In
huggingface

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.

GitHub Repository: huggingface/notebooks
Path: blob/main/course/videos/datasets_overview_pt.ipynb
Views: 2542
Kernel: Unknown Kernel

This notebook regroups the code sample of the video below, which is a part of the Hugging Face course.

#@title from IPython.display import HTML HTML('<iframe width="560" height="315" src="https://www.youtube.com/embed/_BZearw7f0w?rel=0&amp;controls=0&amp;showinfo=0" frameborder="0" allowfullscreen></iframe>')

Install the Transformers and Datasets libraries to run this notebook.

! pip install datasets transformers[sentencepiece]
from datasets import load_dataset raw_datasets = load_dataset("glue", "mrpc") raw_datasets
Reusing dataset glue (/home/sgugger/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
DatasetDict({ train: Dataset({ features: ['sentence1', 'sentence2', 'label', 'idx'], num_rows: 3668 }) validation: Dataset({ features: ['sentence1', 'sentence2', 'label', 'idx'], num_rows: 408 }) test: Dataset({ features: ['sentence1', 'sentence2', 'label', 'idx'], num_rows: 1725 }) })
raw_datasets["train"]
Dataset({ features: ['sentence1', 'sentence2', 'label', 'idx'], num_rows: 3668 })
raw_datasets["train"][6]
{'idx': 6, 'label': 0, 'sentence1': 'The Nasdaq had a weekly gain of 17.27 , or 1.2 percent , closing at 1,520.15 on Friday .', 'sentence2': 'The tech-laced Nasdaq Composite .IXIC rallied 30.46 points , or 2.04 percent , to 1,520.15 .'}
raw_datasets["train"][:5]
{'idx': [0, 1, 2, 3, 4], 'label': [1, 0, 1, 0, 1], 'sentence1': ['Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .", 'They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .', 'Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 .', 'The stock rose $ 2.11 , or about 11 percent , to close Friday at $ 21.51 on the New York Stock Exchange .'], 'sentence2': ['Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .', "Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .", "On June 10 , the ship 's owners had published an advertisement on the Internet , offering the explosives for sale .", 'Tab shares jumped 20 cents , or 4.6 % , to set a record closing high at A $ 4.57 .', 'PG & E Corp. shares jumped $ 1.63 or 8 percent to $ 21.03 on the New York Stock Exchange on Friday .']}
raw_datasets["train"].features
{'sentence1': Value(dtype='string', id=None), 'sentence2': Value(dtype='string', id=None), 'label': ClassLabel(num_classes=2, names=['not_equivalent', 'equivalent'], names_file=None, id=None), 'idx': Value(dtype='int32', id=None)}
from transformers import AutoTokenizer checkpoint = "bert-base-cased" tokenizer = AutoTokenizer.from_pretrained(checkpoint) def tokenize_function(example): return tokenizer( example["sentence1"], example["sentence2"], padding="max_length", truncation=True, max_length=128 ) tokenized_datasets = raw_datasets.map(tokenize_function) print(tokenized_datasets.column_names)
{'train': ['attention_mask', 'idx', 'input_ids', 'label', 'sentence1', 'sentence2', 'token_type_ids'], 'validation': ['attention_mask', 'idx', 'input_ids', 'label', 'sentence1', 'sentence2', 'token_type_ids'], 'test': ['attention_mask', 'idx', 'input_ids', 'label', 'sentence1', 'sentence2', 'token_type_ids']}
from transformers import AutoTokenizer checkpoint = "bert-base-cased" tokenizer = AutoTokenizer.from_pretrained(checkpoint) def tokenize_function(examples): return tokenizer( examples["sentence1"], examples["sentence2"], padding="max_length", truncation=True, max_length=128 ) tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
Loading cached processed dataset at /home/sgugger/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-2b2682faffe74c3f.arrow
Loading cached processed dataset at /home/sgugger/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-754363c6c40d803c.arrow
tokenized_datasets = tokenized_datasets.remove_columns(["idx", "sentence1", "sentence2"]) tokenized_datasets = tokenized_datasets.rename_column("label", "labels") tokenized_datasets = tokenized_datasets.with_format("torch") tokenized_datasets["train"]
Dataset({ features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids'], num_rows: 3668 })
small_train_dataset = tokenized_datasets["train"].select(range(100))