
GitHub Repository: huggingface/notebooks
Path: blob/main/course/fr/chapter7/section7_pt.ipynb
Kernel: Python 3

Question answering (PyTorch)

Install the 🤗 Datasets and 🤗 Transformers libraries to run this notebook.

!pip install datasets transformers[sentencepiece]
!pip install accelerate
# To run the training on a TPU, you will need to uncomment the following line:
# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
!apt install git-lfs

You will need to set up git; adapt your email and name in the following cell.

!git config --global user.email "you@example.com"
!git config --global user.name "Your Name"

You will also need to be logged in to the Hugging Face Hub. Run the following and enter your credentials.

from huggingface_hub import notebook_login

notebook_login()
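If you are working in a terminal rather than a notebook, you can log in with the Hugging Face CLI instead (it is installed alongside the `huggingface_hub` package); a minimal sketch:

# Prompts for the same Hub token as notebook_login()
!huggingface-cli login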
from datasets import load_dataset

raw_datasets = load_dataset("piaf")
# PIAF does not come with a test split, so we create one
raw_datasets = raw_datasets["train"]
raw_datasets = raw_datasets.train_test_split(test_size=0.2, shuffle=True)
raw_datasets
print("Context: ", raw_datasets["train"][0]["context"]) print("Question: ", raw_datasets["train"][0]["question"]) print("Answer: ", raw_datasets["train"][0]["answers"])
raw_datasets["train"].filter(lambda x: len(x["answers"]["text"]) != 1)
print(raw_datasets["test"][0]["answers"]) print(raw_datasets["test"][2]["answers"])
print(raw_datasets["test"][2]["context"]) print(raw_datasets["test"][2]["question"])
from transformers import AutoTokenizer

model_checkpoint = "camembert-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenizer.is_fast
context = raw_datasets["train"][0]["context"]
question = raw_datasets["train"][0]["question"]

inputs = tokenizer(question, context)
tokenizer.decode(inputs["input_ids"])
inputs = tokenizer(
    question,
    context,
    max_length=100,
    truncation="only_second",
    stride=50,
    return_overflowing_tokens=True,
)

for ids in inputs["input_ids"]:
    print(tokenizer.decode(ids))
inputs = tokenizer(
    question,
    context,
    max_length=100,
    truncation="only_second",
    stride=50,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
)
inputs.keys()
inputs["overflow_to_sample_mapping"]
inputs = tokenizer(
    raw_datasets["train"][2:6]["question"],
    raw_datasets["train"][2:6]["context"],
    max_length=100,
    truncation="only_second",
    stride=50,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
)

print(f"The 4 examples gave {len(inputs['input_ids'])} features.")
print(f"Here is where each comes from: {inputs['overflow_to_sample_mapping']}.")
answers = raw_datasets["train"][2:6]["answers"]
start_positions = []
end_positions = []

for i, offset in enumerate(inputs["offset_mapping"]):
    sample_idx = inputs["overflow_to_sample_mapping"][i]
    answer = answers[sample_idx]
    start_char = answer["answer_start"][0]
    end_char = answer["answer_start"][0] + len(answer["text"][0])
    sequence_ids = inputs.sequence_ids(i)

    # Find the start and end of the context
    idx = 0
    while sequence_ids[idx] != 1:
        idx += 1
    context_start = idx
    while sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1

    # If the answer is not fully inside the context, the label is (0, 0)
    if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
        start_positions.append(0)
        end_positions.append(0)
    else:
        # Otherwise it's the start and end token positions
        idx = context_start
        while idx <= context_end and offset[idx][0] <= start_char:
            idx += 1
        start_positions.append(idx - 1)

        idx = context_end
        while idx >= context_start and offset[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)

start_positions, end_positions
idx = 0
sample_idx = inputs["overflow_to_sample_mapping"][idx]
answer = answers[sample_idx]["text"][0]

start = start_positions[idx]
end = end_positions[idx]
labeled_answer = tokenizer.decode(inputs["input_ids"][idx][start : end + 1])

print(f"Theoretical answer: {answer}, labels give: {labeled_answer}")
idx = 4
sample_idx = inputs["overflow_to_sample_mapping"][idx]
answer = answers[sample_idx]["text"][0]

decoded_example = tokenizer.decode(inputs["input_ids"][idx])
print(f"Theoretical answer: {answer}, decoded example: {decoded_example}")
max_length = 384
stride = 128


def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, the label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs
train_dataset = raw_datasets["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)
len(raw_datasets["train"]), len(train_dataset)
def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs
validation_dataset = raw_datasets["test"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["test"].column_names,
)
len(raw_datasets["test"]), len(validation_dataset)
small_eval_set = raw_datasets["test"].select(range(100))
trained_checkpoint = "distilbert-base-cased-distilled-squad"

tokenizer = AutoTokenizer.from_pretrained(trained_checkpoint)
eval_set = small_eval_set.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["test"].column_names,
)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
import torch
from transformers import AutoModelForQuestionAnswering

eval_set_for_model = eval_set.remove_columns(["example_id", "offset_mapping"])
eval_set_for_model.set_format("torch")

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
batch = {k: eval_set_for_model[k].to(device) for k in eval_set_for_model.column_names}
trained_model = AutoModelForQuestionAnswering.from_pretrained(trained_checkpoint).to(
    device
)

with torch.no_grad():
    outputs = trained_model(**batch)
start_logits = outputs.start_logits.cpu().numpy()
end_logits = outputs.end_logits.cpu().numpy()
import collections

example_to_features = collections.defaultdict(list)
for idx, feature in enumerate(eval_set):
    example_to_features[feature["example_id"]].append(idx)
import numpy as np

n_best = 20
max_answer_length = 30
predicted_answers = []

for example in small_eval_set:
    example_id = example["id"]
    context = example["context"]
    answers = []

    for feature_index in example_to_features[example_id]:
        start_logit = start_logits[feature_index]
        end_logit = end_logits[feature_index]
        offsets = eval_set["offset_mapping"][feature_index]

        start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
        end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
        for start_index in start_indexes:
            for end_index in end_indexes:
                # Skip answers that are not fully inside the context
                if offsets[start_index] is None or offsets[end_index] is None:
                    continue
                # Skip answers whose length is either < 0 or > max_answer_length
                if (
                    end_index < start_index
                    or end_index - start_index + 1 > max_answer_length
                ):
                    continue

                answers.append(
                    {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                )

    best_answer = max(answers, key=lambda x: x["logit_score"])
    predicted_answers.append({"id": example_id, "prediction_text": best_answer["text"]})
from datasets import load_metric

metric = load_metric("squad")
theoretical_answers = [
    {"id": ex["id"], "answers": ex["answers"]} for ex in small_eval_set
]
print(predicted_answers[0])
print(theoretical_answers[0])
metric.compute(predictions=predicted_answers, references=theoretical_answers)
from tqdm.auto import tqdm


def compute_metrics(start_logits, end_logits, features, examples):
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all the features associated with this example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully inside the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers whose length is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)
compute_metrics(start_logits, end_logits, eval_set, small_eval_set)
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
from transformers import TrainingArguments

args = TrainingArguments(
    "camembert-base-finetuned-piaf",
    evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,
    push_to_hub=True,
)
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
)
trainer.train()
predictions, _, _ = trainer.predict(validation_dataset)
start_logits, end_logits = predictions
compute_metrics(start_logits, end_logits, validation_dataset, raw_datasets["test"])
trainer.push_to_hub(commit_message="Training complete")
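Once the push has finished, the fine-tuned model can be reloaded from the Hub like any other checkpoint. A minimal sketch, assuming a hypothetical repository id of the form `<your-username>/camembert-base-finetuned-piaf` (use the name created by `push_to_hub` above):

from transformers import AutoModelForQuestionAnswering, AutoTokenizer

# Hypothetical repository id; replace with the one printed by the Trainer
repo_id = "<your-username>/camembert-base-finetuned-piaf"
hub_model = AutoModelForQuestionAnswering.from_pretrained(repo_id)
hub_tokenizer = AutoTokenizer.from_pretrained(repo_id)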
from torch.utils.data import DataLoader
from transformers import default_data_collator

train_dataset.set_format("torch")
validation_set = validation_dataset.remove_columns(["example_id", "offset_mapping"])
validation_set.set_format("torch")

train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=default_data_collator,
    batch_size=8,
)
eval_dataloader = DataLoader(
    validation_set, collate_fn=default_data_collator, batch_size=8
)
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)
from accelerate import Accelerator

accelerator = Accelerator(fp16=True)
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)
from transformers import get_scheduler

num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
from huggingface_hub import Repository, get_full_repo_name

model_name = "camembert-base-finetuned-piaf-accelerate"
repo_name = get_full_repo_name(model_name)
repo_name
output_dir = "camembert-base-finetuned-piaf-accelerate"
repo = Repository(output_dir, clone_from=repo_name)
from tqdm.auto import tqdm
import torch

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for step, batch in enumerate(train_dataloader):
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    start_logits = []
    end_logits = []
    accelerator.print("Evaluation!")
    for batch in tqdm(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)

        start_logits.append(accelerator.gather(outputs.start_logits).cpu().numpy())
        end_logits.append(accelerator.gather(outputs.end_logits).cpu().numpy())

    start_logits = np.concatenate(start_logits)
    end_logits = np.concatenate(end_logits)
    start_logits = start_logits[: len(validation_dataset)]
    end_logits = end_logits[: len(validation_dataset)]

    metrics = compute_metrics(
        start_logits, end_logits, validation_dataset, raw_datasets["test"]
    )
    print(f"epoch {epoch}:", metrics)

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}", blocking=False
        )
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "huggingface-course/camembert-finetuned-piaf"
question_answerer = pipeline("question-answering", model=model_checkpoint)

# The model is a French QA model, so the context and question stay in French
context = """
🤗 Transformers est soutenu par les trois bibliothèques d'apprentissage profond les plus populaires - Jax, PyTorch et TensorFlow - avec une intégration transparente entre elles. Il est simple d'entraîner vos modèles avec l'une avant de les charger pour l'inférence avec l'autre.
"""
question = "Quelles sont les bibliothèques d'apprentissage profond derrière 🤗 Transformers ?"
question_answerer(question=question, context=context)
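The question-answering pipeline returns a dictionary with the predicted answer text, its confidence score, and the character span of the answer in the context. A minimal way to inspect it, assuming the cell above ran successfully:

result = question_answerer(question=question, context=context)
print(result["answer"])                # predicted answer span as a string
print(result["score"])                 # model confidence for that span
print(result["start"], result["end"])  # character indices of the span in the context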