CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutSign UpSign In
huggingface

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.

GitHub Repository: huggingface/notebooks
Path: blob/main/course/videos/token_pipeline_tf.ipynb
Views: 2542
Kernel: Unknown Kernel

This notebook gathers the code samples from the video below, which is part of the Hugging Face course.

#@title
# Embed the course video that this notebook accompanies.
from IPython.display import HTML

HTML('<iframe width="560" height="315" src="https://www.youtube.com/embed/PrX4CjrVnNc?rel=0&amp;controls=0&amp;showinfo=0" frameborder="0" allowfullscreen></iframe>')

Install the Transformers and Datasets libraries to run this notebook.

# Notebook shell escape (not plain Python): install the libraries used by the cells below.
! pip install datasets transformers[sentencepiece]
from transformers import pipeline

# No model is passed, so the pipeline picks its own checkpoint for this task.
token_classifier = pipeline(task="token-classification")

# Last expression of the cell: the notebook displays the returned predictions.
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")
# Rebuild the pipeline, this time asking it to aggregate its predictions
# with the "simple" strategy. Relies on `pipeline` imported in an earlier cell.
token_classifier = pipeline(
    task="token-classification",
    aggregation_strategy="simple",
)

# Last expression of the cell: the notebook displays the returned predictions.
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")
from transformers import AutoTokenizer, TFAutoModelForTokenClassification

# Load the tokenizer and the TensorFlow token-classification model from the
# same checkpoint so that their vocabularies match.
model_checkpoint = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = TFAutoModelForTokenClassification.from_pretrained(model_checkpoint)

# Tokenize one sentence into TensorFlow tensors and run a forward pass.
example = "My name is Sylvain and I work at Hugging Face in Brooklyn."
inputs = tokenizer(example, return_tensors="tf")
outputs = model(**inputs)

# Print the shape of the input ids next to the shape of the logits.
print(inputs["input_ids"].shape)
print(outputs.logits.shape)
import tensorflow as tf

# Turn the logits into probabilities over the label set; `[0]` selects the
# first (and only) sentence in the batch, and the result becomes a plain list.
probabilities = tf.math.softmax(outputs.logits, axis=-1)[0]
probabilities = probabilities.numpy().tolist()

# The predicted class of each token is the argmax of the *logits*. The
# original cell passed `predictions` here, a name that is not yet defined at
# this point, so the cell raised NameError on a fresh kernel.
predictions = tf.math.argmax(outputs.logits, axis=-1)[0]
predictions = predictions.numpy().tolist()
print(predictions)
# Display the model's mapping from predicted class index to label name.
model.config.id2label
# Build one result entry per token whose predicted label is not "O".
results = []

# Tokenize again, this time requesting the character offsets of every token
# so each prediction can be mapped back to a span of `example`.
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
tokens = inputs_with_offsets.tokens()
offsets = inputs_with_offsets["offset_mapping"]

for position, label_id in enumerate(predictions):
    label = model.config.id2label[label_id]
    if label == "O":
        # Tokens labeled "O" are not reported.
        continue

    start, end = offsets[position]
    entry = {
        "entity": label,
        "score": probabilities[position][label_id],
        "word": tokens[position],
        "start": start,
        "end": end,
    }
    results.append(entry)

print(results)
import numpy as np

# Group consecutive tokens of the same entity into a single result entry.
#
# Relies on names defined by earlier cells: `model`, `predictions`,
# `probabilities`, `offsets` and `example`.
label_map = model.config.id2label
results = []
idx = 0
while idx < len(predictions):
    pred = predictions[idx]
    label = label_map[pred]
    if label == "O":
        idx += 1
        continue

    # Remove the B- or I- prefix to keep only the entity type.
    label = label[2:]

    # The current token always belongs to the entity, whether it is tagged
    # B-label or I-label. Consuming it here guarantees that `end` is bound and
    # `all_scores` is non-empty even when the entity starts with a B- tag.
    start, end = offsets[idx]
    all_scores = [probabilities[idx][pred]]
    idx += 1

    # Grab all the following tokens labeled with I-label.
    while idx < len(predictions) and label_map[predictions[idx]] == f"I-{label}":
        # Score each token with the probability of its own predicted class.
        all_scores.append(probabilities[idx][predictions[idx]])
        _, end = offsets[idx]
        idx += 1

    # `idx` now points at the first token after the entity. It is deliberately
    # NOT incremented again, so an entity that immediately follows this one is
    # not skipped.

    # The score is the mean of all the scores of the tokens in that grouped entity.
    score = np.mean(all_scores).item()
    word = example[start:end]
    results.append(
        {
            "entity_group": label,
            "score": score,
            "word": word,
            "start": start,
            "end": end,
        }
    )