Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.
Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.
Path: blob/main/course/fr/chapter6/section3_tf.ipynb
Views: 2548
Kernel: Python 3
Les pouvoirs spéciaux des tokenizers rapides (TensorFlow)
Installez les bibliothèques 🤗 Transformers et 🤗 Datasets pour exécuter ce notebook.
In [ ]:
!pip install datasets transformers[sentencepiece]
In [ ]:
from transformers import AutoTokenizer

# Load the tokenizer that ships with the "camembert-base" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("camembert-base")

# Sample sentence that the following cells keep re-using.
example = "Je m'appelle Sylvain et je travaille à Hugging Face à Brooklyn."

# Calling the tokenizer on a string yields an encoding object; show its class.
encoding = tokenizer(example)
print(type(encoding))
In [ ]:
# Display the `is_fast` flag of the tokenizer loaded above.
tokenizer.is_fast
In [ ]:
# Display the `is_fast` flag of the encoding returned by the tokenizer call.
encoding.is_fast
In [ ]:
# Display the tokens the sample sentence was split into.
encoding.tokens()
In [ ]:
# Display, for each token, the index of the word it belongs to
# (presumably None for special tokens — confirm against the output).
encoding.word_ids()
In [ ]:
# Ask the encoding for the character span covered by the word at index 3,
# then slice that span out of the original sentence.
word_span = encoding.word_to_chars(3)
start, end = word_span
example[start:end]
In [ ]:
from transformers import pipeline

# Token-classification pipeline backed by the camembert-ner checkpoint.
# No aggregation strategy is passed here, so the cell displays whatever the
# pipeline returns by default for the sample sentence.
token_classifier = pipeline(
    task="token-classification",
    model="Jean-Baptiste/camembert-ner",
)
token_classifier("Je m'appelle Sylvain et je travaille à Hugging Face à Brooklyn.")
In [ ]:
from transformers import pipeline

# Same pipeline as before, this time with aggregation_strategy="simple"
# (presumably this merges the tokens of one entity into a single result —
# confirm against the Transformers pipeline documentation).
token_classifier = pipeline(
    task="token-classification",
    model="Jean-Baptiste/camembert-ner",
    aggregation_strategy="simple",
)
token_classifier("Je m'appelle Sylvain et je travaille à Hugging Face à Brooklyn.")
In [ ]:
from transformers import AutoTokenizer, TFAutoModelForTokenClassification

# Tokenizer and TensorFlow model are both loaded from the same checkpoint.
model_checkpoint = "Jean-Baptiste/camembert-ner"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# NOTE(review): from_pt=True presumably makes Transformers convert the
# checkpoint's PyTorch weights for the TF model — confirm in the docs.
model = TFAutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    from_pt=True,
)

# Encode the sample sentence as TensorFlow tensors and run the model on it.
example = "Je m'appelle Sylvain et je travaille à Hugging Face à Brooklyn."
inputs = tokenizer(example, return_tensors="tf")
outputs = model(**inputs)
In [ ]:
# Print the shape of the input ids, then the shape of the logits, one per line.
print(inputs["input_ids"].shape, outputs.logits.shape, sep="\n")
In [ ]:
import tensorflow as tf

logits = outputs.logits

# Softmax over the last axis, keep the first batch item, convert to nested lists.
probabilities = tf.math.softmax(logits, axis=-1)[0].numpy().tolist()

# Index of the highest logit on the last axis, for the first batch item.
predictions = tf.math.argmax(logits, axis=-1)[0].numpy().tolist()
print(predictions)
In [ ]:
# Display the model config's mapping from class index to label name.
model.config.id2label
In [ ]:
# Build one record per token whose predicted label is not "O", pairing the
# label with the probability of the predicted class and the token text.
tokens = inputs.tokens()
id2label = model.config.id2label

results = [
    {
        "entity": id2label[pred],
        "score": probabilities[idx][pred],
        "word": tokens[idx],
    }
    for idx, pred in enumerate(predictions)
    if id2label[pred] != "O"
]
print(results)
In [ ]:
# Tokenize the sentence again, this time requesting the offset mapping,
# and display that mapping.
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
offset_mapping = inputs_with_offsets["offset_mapping"]
offset_mapping
In [ ]:
# Slice the sample sentence with a (start, end) character span.
# NOTE(review): for the sentence defined in this notebook this evaluates to
# " S" (the space before "Sylvain" plus its first letter) — confirm that
# (12, 14) is really a span taken from the offset mapping shown above.
example[12:14]
In [ ]:
# Re-tokenize with offsets so every reported token can carry its character span.
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
tokens = inputs_with_offsets.tokens()
offsets = inputs_with_offsets["offset_mapping"]

# One record per token whose predicted label is not "O".
results = [
    {
        "entity": model.config.id2label[pred],
        "score": probabilities[idx][pred],
        "word": tokens[idx],
        "start": offsets[idx][0],
        "end": offsets[idx][1],
    }
    for idx, pred in enumerate(predictions)
    if model.config.id2label[pred] != "O"
]
print(results)
In [ ]:
# Slice the characters of "Hugging Face" out of the sample sentence.
# In the French sentence used by this notebook that entity spans characters
# 39-51 (assuming "à" is stored as a single code point); the previous
# indices (33, 45) produced "lle à Huggin".
example[39:51]
In [ ]:
import numpy as np

# Group consecutive tokens of the same entity into a single result, reporting
# the mean token score and the character span of the whole group.
results = []
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
tokens = inputs_with_offsets.tokens()
offsets = inputs_with_offsets["offset_mapping"]
id2label = model.config.id2label

idx = 0
while idx < len(predictions):
    pred = predictions[idx]
    label = id2label[pred]
    if label == "O":
        idx += 1
        continue

    # Drop the "B-" or "I-" prefix to get the bare entity type.
    label = label[2:]

    # The current token always opens the group, whether it is tagged B- or I-,
    # so the group is never empty and `end` is always defined.
    start, end = offsets[idx]
    all_scores = [probabilities[idx][pred]]
    idx += 1

    # Absorb every following token tagged I-<label>, scoring each token with
    # the probability of its own predicted class.
    while idx < len(predictions) and id2label[predictions[idx]] == f"I-{label}":
        all_scores.append(probabilities[idx][predictions[idx]])
        _, end = offsets[idx]
        idx += 1

    # idx now points at the first token outside the group; it is deliberately
    # NOT advanced again, so that token is examined by the next iteration
    # instead of being skipped.

    # The score is the mean of the scores of all tokens in the grouped entity.
    score = np.mean(all_scores).item()
    word = example[start:end]
    results.append(
        {
            "entity_group": label,
            "score": score,
            "word": word,
            "start": start,
            "end": end,
        }
    )

print(results)