
Unigram tokenization

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

!pip install datasets evaluate transformers[sentencepiece]
corpus = [
    "This is the Hugging Face Course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
from collections import defaultdict

word_freqs = defaultdict(int)
for text in corpus:
    words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    new_words = [word for word, offset in words_with_offsets]
    for word in new_words:
        word_freqs[word] += 1

word_freqs
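As a quick sanity check (this snippet and the variable sample are added here for illustration, not part of the original notebook), you can run the pre-tokenizer on a single sentence: it returns a list of (word, offsets) pairs, and each word is prefixed with the '▁' character that stands in for spaces.

# Illustrative only: inspect how XLNet's Metaspace pre-tokenizer splits a sentence.
sample = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str("Hello, how are you?")
print(sample)  # a list of (word, (start, end)) pairs, with words prefixed by '▁'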
char_freqs = defaultdict(int)
subwords_freqs = defaultdict(int)
for word, freq in word_freqs.items():
    for i in range(len(word)):
        char_freqs[word[i]] += freq
        # Loop through the subwords of length at least 2
        for j in range(i + 2, len(word) + 1):
            subwords_freqs[word[i:j]] += freq

# Sort subwords by frequency
sorted_subwords = sorted(subwords_freqs.items(), key=lambda x: x[1], reverse=True)
sorted_subwords[:10]
[('▁t', 7), ('is', 5), ('er', 5), ('▁a', 5), ('▁to', 4), ('to', 4), ('en', 4), ('▁T', 3), ('▁Th', 3), ('▁Thi', 3)]
token_freqs = list(char_freqs.items()) + sorted_subwords[: 300 - len(char_freqs)]
token_freqs = {token: freq for token, freq in token_freqs}
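The initial vocabulary therefore keeps every individual character plus the most frequent subwords, capped at 300 tokens in total. A quick check (added for illustration, not in the original notebook):

# Illustrative only: the starting vocabulary has at most 300 tokens and contains every
# character, so any word can always be segmented.
print(len(token_freqs))
print(all(char in token_freqs for char in char_freqs))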
from math import log

total_sum = sum([freq for token, freq in token_freqs.items()])
model = {token: -log(freq / total_sum) for token, freq in token_freqs.items()}
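Each score is a negative log-probability, -log(freq / total_sum), so more frequent tokens get smaller (better) scores. For example (an illustrative check, not in the original notebook), '▁t' appears 7 times and 'is' 5 times in the counts above, so '▁t' must score lower than 'is':

# Illustrative only: a frequency of 7 vs. 5 gives '▁t' a lower (better) score than 'is'.
print(model["▁t"], model["is"])
assert model["▁t"] < model["is"]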
def encode_word(word, model):
    best_segmentations = [{"start": 0, "score": 1}] + [
        {"start": None, "score": None} for _ in range(len(word))
    ]
    for start_idx in range(len(word)):
        # This should be properly filled by the previous steps of the loop
        best_score_at_start = best_segmentations[start_idx]["score"]
        for end_idx in range(start_idx + 1, len(word) + 1):
            token = word[start_idx:end_idx]
            if token in model and best_score_at_start is not None:
                score = model[token] + best_score_at_start
                # If we have found a better segmentation ending at end_idx, we update
                if (
                    best_segmentations[end_idx]["score"] is None
                    or best_segmentations[end_idx]["score"] > score
                ):
                    best_segmentations[end_idx] = {"start": start_idx, "score": score}

    segmentation = best_segmentations[-1]
    if segmentation["score"] is None:
        # We did not find a tokenization of the word -> unknown
        return ["<unk>"], None

    score = segmentation["score"]
    start = segmentation["start"]
    end = len(word)
    tokens = []
    while start != 0:
        tokens.insert(0, word[start:end])
        next_start = best_segmentations[start]["start"]
        end = start
        start = next_start
    tokens.insert(0, word[start:end])
    return tokens, score
print(encode_word("Hopefully", model))
print(encode_word("This", model))
(['H', 'o', 'p', 'e', 'f', 'u', 'll', 'y'], 41.5157494601402)
(['This'], 6.288267030694535)
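encode_word is a Viterbi-style dynamic program: best_segmentations[i] holds the best way to tokenize the first i characters of the word, and the returned score is the sum of the chosen tokens' scores plus the seed score of 1 stored at position 0. A small check (added for illustration, not in the original notebook):

# Illustrative only: the score of the single-token segmentation ['This'] is
# model["This"] plus the seed score of 1 in best_segmentations[0].
tokens, score = encode_word("This", model)
print(score, model["This"] + 1)  # the two numbers should match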
def compute_loss(model):
    loss = 0
    for word, freq in word_freqs.items():
        _, word_loss = encode_word(word, model)
        loss += freq * word_loss
    return loss
compute_loss(model)
413.10377642940875
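The corpus loss is the frequency-weighted sum of each word's best-segmentation score. To see where the total comes from (an illustrative snippet, not in the original notebook), you can print a few per-word contributions:

# Illustrative only: each word contributes freq * (score of its best segmentation) to the loss.
for word, freq in list(word_freqs.items())[:5]:
    _, word_loss = encode_word(word, model)
    print(f"{word!r}: {freq} * {word_loss}")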
import copy


def compute_scores(model):
    scores = {}
    model_loss = compute_loss(model)
    for token, score in model.items():
        # We always keep tokens of length 1
        if len(token) == 1:
            continue
        model_without_token = copy.deepcopy(model)
        _ = model_without_token.pop(token)
        scores[token] = compute_loss(model_without_token) - model_loss
    return scores
scores = compute_scores(model)
print(scores["ll"])
print(scores["his"])
6.376412403623874
0.0
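The two scores differ because 'll' is actually used in the best segmentation of "Hopefully" (see the output above), so removing it forces a worse segmentation and raises the loss, while no best segmentation uses 'his', so removing it costs nothing. A quick way to see this (illustrative, not in the original notebook; model_without_his is a throwaway copy):

# Illustrative only: dropping "his" leaves every best segmentation unchanged,
# which is why its score (the increase in corpus loss) is exactly 0.
model_without_his = copy.deepcopy(model)
_ = model_without_his.pop("his")
print(compute_loss(model_without_his), compute_loss(model))  # the two losses should match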
percent_to_remove = 0.1
while len(model) > 100:
    scores = compute_scores(model)
    sorted_scores = sorted(scores.items(), key=lambda x: x[1])
    # Remove percent_to_remove tokens with the lowest scores.
    for i in range(int(len(model) * percent_to_remove)):
        _ = token_freqs.pop(sorted_scores[i][0])

    total_sum = sum([freq for token, freq in token_freqs.items()])
    model = {token: -log(freq / total_sum) for token, freq in token_freqs.items()}
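Each pass removes the tokens whose removal increases the loss the least (single-character tokens are skipped by compute_scores, so they are never candidates), and the loop stops once the vocabulary holds at most 100 tokens. A quick check (added for illustration, not in the original notebook):

# Illustrative only: the final vocabulary has at most 100 tokens and still contains
# every individual character, so any word remains encodable.
print(len(model))
print(all(char in model for char in char_freqs))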
def tokenize(text, model):
    words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    pre_tokenized_text = [word for word, offset in words_with_offsets]
    encoded_words = [encode_word(word, model)[0] for word in pre_tokenized_text]
    return sum(encoded_words, [])


tokenize("This is the Hugging Face course.", model)
['▁This', '▁is', '▁the', '▁Hugging', '▁Face', '▁', 'c', 'ou', 'r', 's', 'e', '.']
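Words that the small final vocabulary cannot cover with larger units fall back to smaller subwords and single characters, as with "course." above. You can tokenize any other sentence the same way, for example (illustrative call, output not shown in the original notebook):

# Illustrative only: tokenize another sentence from the training corpus with the final model.
print(tokenize("This section shows several tokenizer algorithms.", model))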