CoCalc Logo Icon
Store · Features · Docs · Share · Support · News · About · Sign Up · Sign In
huggingface

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.

GitHub Repository: huggingface/notebooks
Path: blob/main/course/en/chapter6/section3_tf.ipynb
Views: 2555
Kernel: Unknown Kernel

Fast tokenizers' special powers (TensorFlow)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

# Notebook shell escape: installs the Datasets, Evaluate and Transformers
# (with the sentencepiece extra) libraries used by the cells below.
!pip install datasets evaluate transformers[sentencepiece]
from transformers import AutoTokenizer

# Sentence reused throughout the notebook as the running example.
example = "My name is Sylvain and I work at Hugging Face in Brooklyn."

# Load the tokenizer for the checkpoint and encode the example sentence.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
encoding = tokenizer(example)

# Show the class of the object returned by the tokenizer call.
print(type(encoding))
<class 'transformers.tokenization_utils_base.BatchEncoding'>
# Reports whether the loaded tokenizer is a "fast" tokenizer (True here).
tokenizer.is_fast
True
# The same flag is exposed on the encoding returned by the tokenizer (True here).
encoding.is_fast
True
# List the token strings of the encoded sentence, including the special
# [CLS]/[SEP] tokens and the "##"-prefixed sub-word pieces.
encoding.tokens()
['[CLS]', 'My', 'name', 'is', 'S', '##yl', '##va', '##in', 'and', 'I', 'work', 'at', 'Hu', '##gging', 'Face', 'in', 'Brooklyn', '.', '[SEP]']
# For each token, the index of the word it belongs to; the special tokens
# at both ends map to None, and sub-word pieces of one word share an index.
encoding.word_ids()
[None, 0, 1, 2, 3, 3, 3, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, None]
# Look up the character span of the word with index 3 in the original
# sentence, then slice the sentence with it to recover the word's text.
word_span = encoding.word_to_chars(3)
start, end = word_span
example[slice(start, end)]
Sylvain
from transformers import pipeline

# Name the checkpoint explicitly instead of relying on the pipeline's implicit
# default model, so the results below stay reproducible. This is the same
# checkpoint that is loaded by hand later in this notebook.
token_classifier = pipeline(
    "token-classification",
    model="dbmdz/bert-large-cased-finetuned-conll03-english",
)

# Without an aggregation strategy, one prediction is returned per token that
# is not labelled "O".
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")
[{'entity': 'I-PER', 'score': 0.9993828, 'index': 4, 'word': 'S', 'start': 11, 'end': 12}, {'entity': 'I-PER', 'score': 0.99815476, 'index': 5, 'word': '##yl', 'start': 12, 'end': 14}, {'entity': 'I-PER', 'score': 0.99590725, 'index': 6, 'word': '##va', 'start': 14, 'end': 16}, {'entity': 'I-PER', 'score': 0.9992327, 'index': 7, 'word': '##in', 'start': 16, 'end': 18}, {'entity': 'I-ORG', 'score': 0.97389334, 'index': 12, 'word': 'Hu', 'start': 33, 'end': 35}, {'entity': 'I-ORG', 'score': 0.976115, 'index': 13, 'word': '##gging', 'start': 35, 'end': 40}, {'entity': 'I-ORG', 'score': 0.98879766, 'index': 14, 'word': 'Face', 'start': 41, 'end': 45}, {'entity': 'I-LOC', 'score': 0.99321055, 'index': 16, 'word': 'Brooklyn', 'start': 49, 'end': 57}]
from transformers import pipeline

# Name the checkpoint explicitly instead of relying on the pipeline's implicit
# default model, so the results below stay reproducible.
# aggregation_strategy="simple" makes the pipeline return one entry per
# grouped entity rather than one entry per token.
token_classifier = pipeline(
    "token-classification",
    model="dbmdz/bert-large-cased-finetuned-conll03-english",
    aggregation_strategy="simple",
)
token_classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")
[{'entity_group': 'PER', 'score': 0.9981694, 'word': 'Sylvain', 'start': 11, 'end': 18}, {'entity_group': 'ORG', 'score': 0.97960204, 'word': 'Hugging Face', 'start': 33, 'end': 45}, {'entity_group': 'LOC', 'score': 0.99321055, 'word': 'Brooklyn', 'start': 49, 'end': 57}]
from transformers import (
    AutoTokenizer,
    TFAutoModelForTokenClassification,
)

# Checkpoint shared by the tokenizer and the token-classification model.
model_checkpoint = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = TFAutoModelForTokenClassification.from_pretrained(model_checkpoint)

# Encode the example sentence as TensorFlow tensors and run the model on it.
example = "My name is Sylvain and I work at Hugging Face in Brooklyn."
inputs = tokenizer(example, return_tensors="tf")
outputs = model(**inputs)
# Print the shape of the input ids, then the shape of the logits.
for tensor in (inputs["input_ids"], outputs.logits):
    print(tensor.shape)
(1, 19) (1, 19, 9)
import tensorflow as tf

# Per-token class probabilities for the single sentence in the batch,
# converted to plain nested Python lists.
probabilities = tf.math.softmax(outputs.logits, axis=-1)[0].numpy().tolist()

# Per-token id of the highest-scoring class, as a plain Python list.
predictions = tf.math.argmax(outputs.logits, axis=-1)[0].numpy().tolist()
print(predictions)
[0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 6, 6, 6, 0, 8, 0, 0]
# Mapping from predicted class id to its label string ("O", "B-PER", "I-PER", ...).
model.config.id2label
{0: 'O', 1: 'B-MISC', 2: 'I-MISC', 3: 'B-PER', 4: 'I-PER', 5: 'B-ORG', 6: 'I-ORG', 7: 'B-LOC', 8: 'I-LOC'}
# Rebuild the per-token output by hand: one entry for every token whose
# predicted label is not "O".
results = []
tokens = inputs.tokens()

for idx, pred in enumerate(predictions):
    label = model.config.id2label[pred]
    if label == "O":
        continue
    results.append(
        {
            "entity": label,
            "score": probabilities[idx][pred],
            # Position of the token in the encoded sequence, so that each
            # entry carries the 'index' field shown in the expected output.
            "index": idx,
            "word": tokens[idx],
        }
    )

print(results)
[{'entity': 'I-PER', 'score': 0.9993828, 'index': 4, 'word': 'S'}, {'entity': 'I-PER', 'score': 0.99815476, 'index': 5, 'word': '##yl'}, {'entity': 'I-PER', 'score': 0.99590725, 'index': 6, 'word': '##va'}, {'entity': 'I-PER', 'score': 0.9992327, 'index': 7, 'word': '##in'}, {'entity': 'I-ORG', 'score': 0.97389334, 'index': 12, 'word': 'Hu'}, {'entity': 'I-ORG', 'score': 0.976115, 'index': 13, 'word': '##gging'}, {'entity': 'I-ORG', 'score': 0.98879766, 'index': 14, 'word': 'Face'}, {'entity': 'I-LOC', 'score': 0.99321055, 'index': 16, 'word': 'Brooklyn'}]
# Re-encode the sentence, this time asking for the (start, end) character
# offsets of every token, and display them.
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
offset_mapping = inputs_with_offsets["offset_mapping"]
offset_mapping
[(0, 0), (0, 2), (3, 7), (8, 10), (11, 12), (12, 14), (14, 16), (16, 18), (19, 22), (23, 24), (25, 29), (30, 32), (33, 35), (35, 40), (41, 45), (46, 48), (49, 57), (57, 58), (0, 0)]
# (12, 14) is the offset pair recorded for the token "##yl"; slicing the
# sentence with it yields the raw text of that piece, without the "##" prefix.
example[12:14]
yl
# Same per-token reconstruction as before, now with each token's character
# offsets in the original sentence added as "start" and "end".
results = []
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
tokens = inputs_with_offsets.tokens()
offsets = inputs_with_offsets["offset_mapping"]

for idx, pred in enumerate(predictions):
    label = model.config.id2label[pred]
    if label == "O":
        continue
    start, end = offsets[idx]
    results.append(
        {
            "entity": label,
            "score": probabilities[idx][pred],
            # Position of the token in the encoded sequence, so that each
            # entry carries the 'index' field shown in the expected output.
            "index": idx,
            "word": tokens[idx],
            "start": start,
            "end": end,
        }
    )

print(results)
[{'entity': 'I-PER', 'score': 0.9993828, 'index': 4, 'word': 'S', 'start': 11, 'end': 12}, {'entity': 'I-PER', 'score': 0.99815476, 'index': 5, 'word': '##yl', 'start': 12, 'end': 14}, {'entity': 'I-PER', 'score': 0.99590725, 'index': 6, 'word': '##va', 'start': 14, 'end': 16}, {'entity': 'I-PER', 'score': 0.9992327, 'index': 7, 'word': '##in', 'start': 16, 'end': 18}, {'entity': 'I-ORG', 'score': 0.97389334, 'index': 12, 'word': 'Hu', 'start': 33, 'end': 35}, {'entity': 'I-ORG', 'score': 0.976115, 'index': 13, 'word': '##gging', 'start': 35, 'end': 40}, {'entity': 'I-ORG', 'score': 0.98879766, 'index': 14, 'word': 'Face', 'start': 41, 'end': 45}, {'entity': 'I-LOC', 'score': 0.99321055, 'index': 16, 'word': 'Brooklyn', 'start': 49, 'end': 57}]
# 33 is the start offset of the token "Hu" and 45 the end offset of "Face",
# so this slice recovers the text of the whole grouped entity.
example[33:45]
Hugging Face
import numpy as np

# Group consecutive tokens belonging to the same entity into a single entry,
# using the character offsets to recover the entity text from the sentence.
results = []
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
tokens = inputs_with_offsets.tokens()
offsets = inputs_with_offsets["offset_mapping"]
id2label = model.config.id2label

idx = 0
while idx < len(predictions):
    pred = predictions[idx]
    label = id2label[pred]
    if label == "O":
        idx += 1
        continue

    # Remove the B- or I- prefix to get the entity type.
    label = label[2:]

    # The first token always opens the group, whether it is tagged B- or I-.
    # Seeding the span and the scores here guarantees `end` is defined and
    # the score list is never empty.
    start, end = offsets[idx]
    all_scores = [probabilities[idx][pred]]
    idx += 1

    # Extend the group with every following token labelled I-<label>.
    while idx < len(predictions) and id2label[predictions[idx]] == f"I-{label}":
        # Score each token with the probability of its own predicted class.
        all_scores.append(probabilities[idx][predictions[idx]])
        _, end = offsets[idx]
        idx += 1

    # The score is the mean of all the scores of the tokens in that grouped entity
    score = np.mean(all_scores).item()
    word = example[start:end]
    results.append(
        {
            "entity_group": label,
            "score": score,
            "word": word,
            "start": start,
            "end": end,
        }
    )
    # No extra increment here: idx already points at the first token after
    # the group, which may itself start a new entity.

print(results)
[{'entity_group': 'PER', 'score': 0.9981694, 'word': 'Sylvain', 'start': 11, 'end': 18}, {'entity_group': 'ORG', 'score': 0.97960204, 'word': 'Hugging Face', 'start': 33, 'end': 45}, {'entity_group': 'LOC', 'score': 0.99321055, 'word': 'Brooklyn', 'start': 49, 'end': 57}]