Les pouvoirs spéciaux des tokenizers rapides (TensorFlow)

Installez les bibliothèques 🤗 Transformers et 🤗 Datasets pour exécuter ce notebook.

In [ ]:

!pip install datasets transformers[sentencepiece]

In [ ]:

# Load the tokenizer published under the "camembert-base" checkpoint and use
# it to encode one French example sentence.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("camembert-base")
example = "Je m'appelle Sylvain et je travaille à Hugging Face à Brooklyn."
encoding = tokenizer(example)
# Print the class of the object returned by the tokenizer call.
print(type(encoding))

In [ ]:

# Display the tokenizer's `is_fast` attribute.
tokenizer.is_fast

In [ ]:

# Display the `is_fast` attribute of the encoding returned by the tokenizer.
encoding.is_fast

In [ ]:

# Display the tokens held by the encoding.
encoding.tokens()

In [ ]:

# Display the word ids of the encoding (per the BatchEncoding documentation,
# one entry per token giving the index of the word it comes from).
encoding.word_ids()

In [ ]:

# Ask the encoding for the character span of word index 3, then slice the
# original sentence with that span to display the corresponding text.
start, end = encoding.word_to_chars(3)
example[start:end]

In [ ]:

# Build a token-classification pipeline backed by the
# "Jean-Baptiste/camembert-ner" checkpoint and run it on the example sentence.
# No aggregation strategy is passed here, so the pipeline default applies.
from transformers import pipeline

token_classifier = pipeline("token-classification", model="Jean-Baptiste/camembert-ner")
token_classifier("Je m'appelle Sylvain et je travaille à Hugging Face à Brooklyn.")

In [ ]:

# Same pipeline as before, but created with aggregation_strategy="simple"
# (per the pipeline documentation, this groups tokens belonging to the same
# entity into a single result).
from transformers import pipeline

token_classifier = pipeline("token-classification", model="Jean-Baptiste/camembert-ner", aggregation_strategy="simple")
token_classifier("Je m'appelle Sylvain et je travaille à Hugging Face à Brooklyn.")

In [ ]:

# Load the tokenizer and the TensorFlow token-classification model for the
# same checkpoint, then run the model once on the example sentence.
from transformers import AutoTokenizer, TFAutoModelForTokenClassification

model_checkpoint = "Jean-Baptiste/camembert-ner"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# from_pt=True asks Transformers to load the checkpoint's PyTorch weights;
# presumably no TensorFlow weights are published for it (verify on the hub).
model = TFAutoModelForTokenClassification.from_pretrained(model_checkpoint, from_pt=True)

example = "Je m'appelle Sylvain et je travaille à Hugging Face à Brooklyn."
# return_tensors="tf" makes the tokenizer return TensorFlow tensors, which are
# then unpacked as keyword arguments for the model call.
inputs = tokenizer(example, return_tensors="tf")
outputs = model(**inputs)

In [ ]:

# Print the shape of the input ids next to the shape of the model's logits
# so the two can be compared.
print(inputs["input_ids"].shape)
print(outputs.logits.shape)

In [ ]:

import tensorflow as tf

# Softmax over the last axis turns the logits into probabilities; argmax over
# the same axis gives the predicted class id. `[0]` keeps the first element
# along the leading axis, and both results are converted to plain Python lists.
probabilities = tf.math.softmax(outputs.logits, axis=-1)[0].numpy().tolist()
predictions = tf.math.argmax(outputs.logits, axis=-1)[0].numpy().tolist()
print(predictions)

In [ ]:

# Display the model configuration's mapping from class id to label name.
model.config.id2label

In [ ]:

tokens = inputs.tokens()

# One entry per token whose predicted label is not "O": the label name, the
# probability of the predicted class for that token, and the token text.
results = [
    {
        "entity": model.config.id2label[pred],
        "score": probabilities[idx][pred],
        "word": tokens[idx],
    }
    for idx, pred in enumerate(predictions)
    if model.config.id2label[pred] != "O"
]

print(results)

In [ ]:

# Re-tokenize the example asking for the offsets mapping as well, then
# display that mapping.
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
inputs_with_offsets["offset_mapping"]

In [ ]:

# Display characters 12 and 13 of the example sentence.
# NOTE(review): on the French example sentence this slice is " S" (the space
# before "Sylvain" plus its first letter); confirm these bounds match an entry
# of the offsets mapping displayed above rather than another sentence.
example[12:14]

In [ ]:

inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
tokens = inputs_with_offsets.tokens()
offsets = inputs_with_offsets["offset_mapping"]

# Same per-token listing as before, now carrying the character span of each
# token taken from the offsets mapping.
results = []
for idx, pred in enumerate(predictions):
    label = model.config.id2label[pred]
    if label == "O":
        # Tokens predicted as "O" are left out of the results.
        continue
    start, end = offsets[idx]
    results.append(
        dict(
            entity=label,
            score=probabilities[idx][pred],
            word=tokens[idx],
            start=start,
            end=end,
        )
    )

print(results)

In [ ]:

# Display the character span covering "Hugging Face" in the example sentence.
# In "Je m'appelle Sylvain et je travaille à Hugging Face à Brooklyn." the
# "H" is at index 39 and the "e" of "Face" at index 50, so the slice is
# [39:51]. The previous bounds [33:45] cut through "travaille" and "Hugging".
example[39:51]

In [ ]:

import numpy as np

# Group consecutive tokens that belong to the same entity into one result
# holding the entity type, the mean token score, the text span and its
# character offsets in `example`.
results = []
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
tokens = inputs_with_offsets.tokens()
offsets = inputs_with_offsets["offset_mapping"]

idx = 0
while idx < len(predictions):
    pred = predictions[idx]
    label = model.config.id2label[pred]
    if label == "O":
        idx += 1
        continue

    # Remove the B- or I- prefix
    label = label[2:]

    # The current token always opens the entity, whether it is tagged B- or
    # I-, so its span and score are recorded before looking at what follows.
    start, end = offsets[idx]
    all_scores = [probabilities[idx][pred]]
    idx += 1

    # Grab all the following tokens labelled I-label
    while (
        idx < len(predictions)
        and model.config.id2label[predictions[idx]] == f"I-{label}"
    ):
        # Score each token with the probability of its own predicted class
        all_scores.append(probabilities[idx][predictions[idx]])
        _, end = offsets[idx]
        idx += 1

    # The score is the mean of the scores of all tokens in the grouped entity
    score = np.mean(all_scores).item()
    word = example[start:end]
    results.append(
        {
            "entity_group": label,
            "score": score,
            "word": word,
            "start": start,
            "end": end,
        }
    )
    # No extra increment here: idx already points at the first token after the
    # entity, which may itself start a new entity and must not be skipped.

print(results)

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.

Les pouvoirs spéciaux des tokenizers rapides (TensorFlow)

Product

Resources

Company

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more, all in one place. Commercial Alternative to JupyterHub.

Les pouvoirs spéciaux des tokenizers rapides (TensorFlow)

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.