CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Sign Up | Sign In
huggingface

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.

GitHub Repository: huggingface/notebooks
Path: blob/main/course/th/chapter6/section6.ipynb
Views: 2554
Kernel: Unknown Kernel

WordPiece tokenization

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

!pip install datasets evaluate transformers[sentencepiece]
# Toy training corpus for the WordPiece walkthrough.
# The first sentence spells "Course" with a capital C so that the corpus agrees
# with the recorded outputs, which list a 'Course' word and a 'C' symbol.
corpus = [
    "This is the Hugging Face Course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]
from transformers import AutoTokenizer

# Load the pretrained "bert-base-cased" tokenizer; the cells below use its
# pre-tokenizer to split raw text into words.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
from collections import defaultdict

# Count how often each pre-tokenized word occurs across the whole corpus.
word_freqs = defaultdict(int)
for sentence in corpus:
    # pre_tokenize_str yields (word, offsets) pairs; only the word is needed.
    for token, _offsets in tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(sentence):
        word_freqs[token] += 1

word_freqs
defaultdict( int, {'This': 3, 'is': 2, 'the': 1, 'Hugging': 1, 'Face': 1, 'Course': 1, '.': 4, 'chapter': 1, 'about': 1, 'tokenization': 1, 'section': 1, 'shows': 1, 'several': 1, 'tokenizer': 1, 'algorithms': 1, 'Hopefully': 1, ',': 1, 'you': 1, 'will': 1, 'be': 1, 'able': 1, 'to': 1, 'understand': 1, 'how': 1, 'they': 1, 'are': 1, 'trained': 1, 'and': 1, 'generate': 1, 'tokens': 1})
# Base alphabet: the first character of every word as-is, and every later
# character with the "##" continuation prefix. Duplicates collapse in the sets
# and the result is a sorted list.
alphabet = sorted(
    {word[0] for word in word_freqs.keys()}
    | {f"##{letter}" for word in word_freqs.keys() for letter in word[1:]}
)

print(alphabet)
['##a', '##b', '##c', '##d', '##e', '##f', '##g', '##h', '##i', '##k', '##l', '##m', '##n', '##o', '##p', '##r', '##s', '##t', '##u', '##v', '##w', '##y', '##z', ',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'g', 'h', 'i', 's', 't', 'u', 'w', 'y']
# Initial vocabulary: the special tokens followed by a copy of the alphabet.
vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", *alphabet]
# Initial segmentation of every word: the first character stands alone and each
# following character carries the "##" continuation prefix.
splits = {
    word: [*word[:1], *(f"##{c}" for c in word[1:])]
    for word in word_freqs
}
def compute_pair_scores(splits):
    """Score every adjacent symbol pair in the current word segmentations.

    The score of a pair is

        pair_freq / (freq(first symbol) * freq(second symbol))

    where all frequencies are weighted by the word counts in the module-level
    ``word_freqs`` mapping.

    Args:
        splits: Mapping from each word in ``word_freqs`` to its current list
            of symbols.

    Returns:
        A dict mapping ``(first, second)`` symbol tuples to their float score,
        in order of first occurrence. Words made of zero or one symbol
        contribute no pairs.
    """
    symbol_freqs = defaultdict(int)
    pair_freqs = defaultdict(int)
    for word, freq in word_freqs.items():
        pieces = splits[word]
        # Count every symbol uniformly; this also covers empty and
        # single-symbol splits without special cases (an empty split used to
        # raise IndexError on the trailing-symbol lookup).
        for piece in pieces:
            symbol_freqs[piece] += freq
        for pair in zip(pieces, pieces[1:]):
            pair_freqs[pair] += freq

    return {
        pair: freq / (symbol_freqs[pair[0]] * symbol_freqs[pair[1]])
        for pair, freq in pair_freqs.items()
    }
pair_scores = compute_pair_scores(splits)
# Preview only the first six scored pairs.
for key in list(pair_scores)[:6]:
    print(f"{key}: {pair_scores[key]}")
('T', '##h'): 0.125 ('##h', '##i'): 0.03409090909090909 ('##i', '##s'): 0.02727272727272727 ('i', '##s'): 0.1 ('t', '##h'): 0.03571428571428571 ('##h', '##e'): 0.011904761904761904
# Pick the highest-scoring pair. On ties the pair seen first wins, and an empty
# score table falls back to ("", None).
best_pair, max_score = max(
    pair_scores.items(),
    key=lambda entry: entry[1],
    default=("", None),
)
print(best_pair, max_score)
('a', '##b') 0.2
# Add the token obtained by merging the best pair ('a', '##b') to the vocabulary.
vocab.append("ab")
def merge_pair(a, b, splits):
    """Merge every adjacent occurrence of symbols ``a`` and ``b`` in ``splits``.

    Iterates over the words of the module-level ``word_freqs`` mapping. A "##"
    continuation prefix on ``b`` is dropped when the two symbols are joined.

    Args:
        a: Left symbol of the pair.
        b: Right symbol of the pair.
        splits: Mapping from word to its current list of symbols; entries are
            reassigned in place.

    Returns:
        The same ``splits`` mapping that was passed in.
    """
    merged = a + b[2:] if b.startswith("##") else a + b
    for word in word_freqs:
        pieces = splits[word]
        if len(pieces) == 1:
            # A single-symbol word has no pair to merge.
            continue
        pos = 0
        while pos < len(pieces) - 1:
            if (pieces[pos], pieces[pos + 1]) == (a, b):
                # Build a new list and stay at the same position so the freshly
                # merged symbol is examined again on the next iteration.
                pieces = pieces[:pos] + [merged] + pieces[pos + 2 :]
            else:
                pos += 1
        splits[word] = pieces
    return splits
# Apply the ('a', '##b') merge to every word, then inspect the split of "about".
splits = merge_pair("a", "##b", splits)
splits["about"]
['ab', '##o', '##u', '##t']
# Keep merging the best-scoring pair until the vocabulary reaches the target size.
vocab_size = 70
while len(vocab) < vocab_size:
    scores = compute_pair_scores(splits)
    if not scores:
        # No adjacent pairs remain, so nothing can be merged. Stop here instead
        # of failing when an empty best pair is unpacked into merge_pair.
        break
    # max() returns the first maximal item, so ties go to the pair seen first.
    best_pair, max_score = max(scores.items(), key=lambda entry: entry[1])
    splits = merge_pair(*best_pair, splits)
    # The merged token drops the "##" continuation prefix of the right symbol.
    new_token = best_pair[0] + (
        best_pair[1][2:] if best_pair[1].startswith("##") else best_pair[1]
    )
    vocab.append(new_token)

print(vocab)
['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', '##a', '##b', '##c', '##d', '##e', '##f', '##g', '##h', '##i', '##k', '##l', '##m', '##n', '##o', '##p', '##r', '##s', '##t', '##u', '##v', '##w', '##y', '##z', ',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'g', 'h', 'i', 's', 't', 'u', 'w', 'y', '##fu', 'Fa', 'Fac', '##ct', '##ful', '##full', '##fully', 'Th', 'ch', '##hm', 'cha', 'chap', 'chapt', '##thm', 'Hu', 'Hug', 'Hugg', 'sh', 'th', 'is', '##thms', '##za', '##zat', '##ut']
def encode_word(word):
    """Split ``word`` into vocabulary tokens by greedy longest-match-first.

    Starting at the beginning of the word, the longest substring present in the
    module-level ``vocab`` is taken as the next token. Every token after the
    first is looked up with the "##" continuation prefix.

    Args:
        word: The word to encode.

    Returns:
        The list of tokens, ``["[UNK]"]`` if some part of the word cannot be
        matched, or an empty list for an empty word.
    """
    tokens = []
    start = 0
    while start < len(word):
        prefix = "##" if start > 0 else ""
        end = len(word)
        # Shrink the candidate from the right until it is in the vocabulary.
        # The candidate always covers at least one real character, so a vocab
        # entry such as "#" or "##" can never match the continuation marker
        # alone and stall the loop.
        while end > start and prefix + word[start:end] not in vocab:
            end -= 1
        if end == start:
            return ["[UNK]"]
        tokens.append(prefix + word[start:end])
        start = end
    return tokens
# "Hugging" is covered by learned tokens; "HOgging" cannot be fully matched and
# collapses to the unknown token.
print(encode_word("Hugging"))
print(encode_word("HOgging"))
['Hugg', '##i', '##n', '##g'] ['[UNK]']
def tokenize(text):
    """Tokenize ``text`` with the vocabulary learned above.

    The text is first split into words by the pre-tokenizer of the module-level
    ``tokenizer``, then each word is encoded with ``encode_word``.

    Args:
        text: The raw string to tokenize.

    Returns:
        A flat list with the tokens of every word, in order.
    """
    # Use the public backend_tokenizer accessor, as the word-counting cell does,
    # rather than the private _tokenizer attribute.
    pre_tokenized = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    # Flatten in a single pass; sum(list_of_lists, []) re-copies the accumulated
    # list for every word.
    return [
        token
        for word, _offsets in pre_tokenized
        for token in encode_word(word)
    ]
# Tokenize a full sentence; per the recorded output, the trailing "!" comes out
# as [UNK].
tokenize("This is the Hugging Face course!")
['Th', '##i', '##s', 'is', 'th', '##e', 'Hugg', '##i', '##n', '##g', 'Fac', '##e', 'c', '##o', '##u', '##r', '##s', '##e', '[UNK]']