
GitHub Repository: huggingface/notebooks
Path: blob/main/course/videos/batch_inputs_pt.ipynb

This notebook contains the code samples from the video below, which is part of the Hugging Face course.

#@title
from IPython.display import HTML

HTML('<iframe width="560" height="315" src="https://www.youtube.com/embed/M6adb1j2jPI?rel=0&amp;controls=0&amp;showinfo=0" frameborder="0" allowfullscreen></iframe>')

Install the Transformers and Datasets libraries to run this notebook.

! pip install datasets transformers[sentencepiece]
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

sentences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this.",
]

tokens = [tokenizer.tokenize(sentence) for sentence in sentences]
ids = [tokenizer.convert_tokens_to_ids(token) for token in tokens]
print(ids[0])
print(ids[1])
[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]
[1045, 5223, 2023, 1012]
import torch

ids = [[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],
       [1045, 5223, 2023, 1012]]

input_ids = torch.tensor(ids)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-2-b3483c81e2fd> in <module>
      4     [1045, 5223, 2023, 1012]]
      5
----> 6 input_ids = torch.tensor(ids)

ValueError: expected sequence of length 14 at dim 1 (got 4)
import torch

ids = [[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],
       [1045, 5223, 2023, 1012, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

input_ids = torch.tensor(ids)
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.pad_token_id
0
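Since the tokenizer exposes its padding token id, the zeros above don't need to be written by hand. The following is a minimal sketch (not part of the course code) that pads the two original, unequal-length sequences with torch.nn.utils.rnn.pad_sequence:

from torch.nn.utils.rnn import pad_sequence

# The two sequences before any manual padding, as produced by the first cell.
unpadded = [
    torch.tensor([1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]),
    torch.tensor([1045, 5223, 2023, 1012]),
]

# pad_sequence fills the shorter sequences up to the longest one,
# using the tokenizer's pad token id (0 here) as the fill value.
input_ids = pad_sequence(unpadded, batch_first=True, padding_value=tokenizer.pad_token_id)
print(input_ids.shape)  # torch.Size([2, 14])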
from transformers import AutoModelForSequenceClassification

ids1 = torch.tensor(
    [[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]]
)
ids2 = torch.tensor([[1045, 5223, 2023, 1012]])
all_ids = torch.tensor(
    [[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],
     [1045, 5223, 2023, 1012, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
)

model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
print(model(ids1).logits)
print(model(ids2).logits)
print(model(all_ids).logits)
tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward>)
tensor([[ 3.9497, -3.1357]], grad_fn=<AddmmBackward>)
tensor([[-2.7276,  2.8789],
        [ 1.5444, -1.3998]], grad_fn=<AddmmBackward>)
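Note that the second row of the batched logits ([1.5444, -1.3998]) does not match the logits for "I hate this." computed on its own ([3.9497, -3.1357]): without a mask, the model attends to the padding tokens, which changes the result. A quick check, reusing the tensors defined above:

# The padded row of the batch differs from the unpadded single-sentence result.
print(torch.allclose(model(all_ids).logits[1], model(ids2).logits[0], atol=1e-4))  # False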
all_ids = torch.tensor(
    [[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012],
     [1045, 5223, 2023, 1012, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
)
# 1 marks a real token, 0 marks padding the model should ignore.
attention_mask = torch.tensor(
    [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
     [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
output1 = model(ids1)
output2 = model(ids2)
print(output1.logits)
print(output2.logits)
tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward>)
tensor([[ 3.9497, -3.1357]], grad_fn=<AddmmBackward>)
output = model(all_ids, attention_mask=attention_mask)
print(output.logits)
tensor([[-2.7276,  2.8789],
        [ 3.9497, -3.1357]], grad_fn=<AddmmBackward>)
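With the attention mask, the batched logits now match the single-sentence results. To turn these logits into predictions, they can be passed through a softmax and mapped to the model's labels; a short sketch, assuming this SST-2 checkpoint's id2label maps 0/1 to NEGATIVE/POSITIVE as is standard:

import torch.nn.functional as F

# Convert logits to probabilities and pick the most likely label per sentence.
probabilities = F.softmax(output.logits, dim=-1)
for sentence, probs in zip(sentences, probabilities):
    label_id = int(probs.argmax())
    print(sentence, "->", model.config.id2label[label_id], round(float(probs[label_id]), 4))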
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

sentences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this.",
]

print(tokenizer(sentences, padding=True))
{'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], [101, 1045, 5223, 2023, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}
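In practice, the tokenizer can produce everything the model needs in one call: asking for PyTorch tensors and unpacking the result handles padding, the attention mask, and the special tokens ([CLS]=101, [SEP]=102) automatically. A minimal sketch:

# Let the tokenizer build the padded batch and attention mask as tensors,
# then feed them straight into the model.
batch = tokenizer(sentences, padding=True, return_tensors="pt")
outputs = model(**batch)
print(outputs.logits)

Because the tokenizer also adds the special tokens, these logits will differ slightly from the ones computed above on the manually built batch.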