CoCalc Logo Icon
Store · Features · Docs · Share · Support · News · About · Sign Up · Sign In
huggingface

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.

GitHub Repository: huggingface/notebooks
Path: blob/main/course/fr/chapter6/section3_pt.ipynb
Views: 2555
Kernel: Python 3

Les pouvoirs spéciaux des tokenizers rapides (PyTorch)

Installez les bibliothèques 🤗 Transformers et 🤗 Datasets pour exécuter ce notebook.

!pip install datasets transformers[sentencepiece]
from transformers import AutoTokenizer

# Load the tokenizer that ships with the "camembert-base" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("camembert-base")
example = "Je m'appelle Sylvain et je travaille à Hugging Face à Brooklyn."
# Calling the tokenizer on a string returns an encoding object; print its
# type to see which class we got back.
encoding = tokenizer(example)
print(type(encoding))

# Ask the tokenizer whether it reports itself as a "fast" implementation.
tokenizer.is_fast

# The encoding object exposes the same flag.
encoding.is_fast
# List the tokens of the encoded example.
encoding.tokens()

# For each token, the index of the word it belongs to
# (presumably None for special tokens — confirm against the BatchEncoding docs).
encoding.word_ids()

# Map word number 3 back to its character span in the original string,
# then slice that span out of the text.
start, end = encoding.word_to_chars(3)
example[start:end]
from transformers import pipeline

# Build a token-classification pipeline on the "Jean-Baptiste/camembert-ner"
# checkpoint. No aggregation_strategy is passed here, so the pipeline's
# default applies.
token_classifier = pipeline("token-classification", model="Jean-Baptiste/camembert-ner")
token_classifier("Je m'appelle Sylvain et je travaille à Hugging Face à Brooklyn.")
from transformers import pipeline

# Same pipeline as before, but with aggregation_strategy="simple"
# (presumably this merges the tokens of one entity into a single result —
# confirm against the pipeline documentation).
token_classifier = pipeline("token-classification", model="Jean-Baptiste/camembert-ner", aggregation_strategy="simple")
token_classifier("Je m'appelle Sylvain et je travaille à Hugging Face à Brooklyn.")
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Load the tokenizer and the token-classification model from the same
# checkpoint so that vocabulary and label set match.
model_checkpoint = "Jean-Baptiste/camembert-ner"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)

example = "Je m'appelle Sylvain et je travaille à Hugging Face à Brooklyn."
# return_tensors="pt" asks for PyTorch tensors, ready to be fed to the model.
inputs = tokenizer(example, return_tensors="pt")

# This is inference only: disable autograd so no computation graph is built
# and kept alive by `outputs`. The logits themselves are unchanged.
with torch.no_grad():
    outputs = model(**inputs)
# Compare the shape of the input ids with the shape of the model's logits.
print(inputs["input_ids"].shape)
print(outputs.logits.shape)

import torch

# Softmax over the last dimension turns the logits into probabilities;
# [0] selects the first (and only) sequence of the batch, and tolist()
# converts the tensor to nested Python lists.
probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)[0].tolist()
# argmax over the same dimension gives, for each token, the id of the
# highest-scoring label.
predictions = outputs.logits.argmax(dim=-1)[0].tolist()
print(predictions)

# Mapping from label id to label name, as stored in the model configuration.
model.config.id2label
# Build one result per token whose predicted label is not "O": the label
# name, the probability assigned to that label, and the token text.
tokens = inputs.tokens()
results = [
    {
        "entity": model.config.id2label[label_id],
        "score": token_probs[label_id],
        "word": token,
    }
    for token, token_probs, label_id in zip(tokens, probabilities, predictions)
    if model.config.id2label[label_id] != "O"
]

print(results)
# Tokenize the example again, this time requesting the offset mapping, and
# look at it (presumably one (start, end) character span per token —
# confirm against the tokenizer documentation).
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
inputs_with_offsets["offset_mapping"]

# Slice the original text between characters 12 and 14 to see which piece
# of the sentence that span covers.
example[12:14]
# Same per-token results as before, extended with the character span of
# each token taken from the tokenizer's offset mapping.
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
tokens = inputs_with_offsets.tokens()
offsets = inputs_with_offsets["offset_mapping"]

results = [
    {
        "entity": model.config.id2label[label_id],
        "score": token_probs[label_id],
        "word": token,
        "start": start,
        "end": end,
    }
    for token, (start, end), token_probs, label_id in zip(
        tokens, offsets, probabilities, predictions
    )
    if model.config.id2label[label_id] != "O"
]

print(results)
# Slice the original text between characters 39 and 51 to recover the raw
# substring covered by that span.
example[39:51]
import numpy as np

# Group consecutive tokens that belong to the same entity into a single
# result carrying the entity type, the mean token score, the exact substring
# of the original text, and its character span.
results = []
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
tokens = inputs_with_offsets.tokens()
offsets = inputs_with_offsets["offset_mapping"]
id2label = model.config.id2label

idx = 0
while idx < len(predictions):
    label = id2label[predictions[idx]]
    if label == "O":
        idx += 1
        continue

    # Remove the B- or I- prefix to get the bare entity type.
    label = label[2:]

    # The current token always opens the group, whether it is tagged B- or I-.
    # Seeding the span and the score list from it guarantees that `end` is
    # defined and that the mean below is never taken over an empty list.
    start, end = offsets[idx]
    all_scores = [probabilities[idx][predictions[idx]]]
    idx += 1

    # Absorb every following token tagged I-<label>, extending the span and
    # recording the probability of each token's own predicted label.
    while idx < len(predictions) and id2label[predictions[idx]] == f"I-{label}":
        all_scores.append(probabilities[idx][predictions[idx]])
        _, end = offsets[idx]
        idx += 1

    # The score is the mean of the scores of all tokens in the grouped entity.
    score = np.mean(all_scores).item()
    word = example[start:end]
    results.append(
        {
            "entity_group": label,
            "score": score,
            "word": word,
            "start": start,
            "end": end,
        }
    )
    # No extra increment here: idx already points at the first token after
    # the entity, which must be examined too (it may start another entity).

print(results)