Question answering (TensorFlow)
Install the Transformers and Datasets libraries to run this notebook.
In [ ]:
!pip install datasets transformers[sentencepiece]
!apt install git-lfs
You will need to configure git; adapt your email and name in the following cell.
In [ ]:
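# Standard git identity setup; the email and name below are placeholders to replace with your own
!git config --global user.email "you@example.com"
!git config --global user.name "Your Name"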
You will also need to be logged in to the Hugging Face Hub. Run the following cell and enter your credentials.
In [ ]:
from huggingface_hub import notebook_login

notebook_login()
In [ ]:
from datasets import load_dataset

raw_datasets = load_dataset("piaf")
# piaf has no validation split, so we create one ourselves
raw_datasets = raw_datasets["train"]
raw_datasets = raw_datasets.train_test_split(test_size=0.2, shuffle=True)
In [ ]:
raw_datasets
In [ ]:
print("Context: ", raw_datasets["train"][0]["context"]) print("Question: ", raw_datasets["train"][0]["question"]) print("Answer: ", raw_datasets["train"][0]["answers"])
In [ ]:
raw_datasets["train"].filter(lambda x: len(x["answers"]["text"]) != 1)
In [ ]:
print(raw_datasets["test"][0]["answers"]) print(raw_datasets["test"][2]["answers"])
In [ ]:
print(raw_datasets["test"][2]["context"]) print(raw_datasets["test"][2]["question"])
In [ ]:
from transformers import AutoTokenizer

model_checkpoint = "camembert-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
In [ ]:
tokenizer.is_fast
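The offset mappings used throughout the preprocessing below are only available with fast (Rust-backed) tokenizers, so it is worth failing early if the check above returns False. A minimal guard, not part of the original notebook:
In [ ]:
# Hypothetical guard: offset mappings require a fast tokenizer
assert tokenizer.is_fast, "This notebook needs a fast tokenizer for offset mappings."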
In [ ]:
context = raw_datasets["train"][0]["context"]
question = raw_datasets["train"][0]["question"]

inputs = tokenizer(question, context)
tokenizer.decode(inputs["input_ids"])
In [ ]:
inputs = tokenizer(
    question,
    context,
    max_length=100,
    truncation="only_second",
    stride=50,
    return_overflowing_tokens=True,
)

for ids in inputs["input_ids"]:
    print(tokenizer.decode(ids))
In [ ]:
inputs = tokenizer(
    question,
    context,
    max_length=100,
    truncation="only_second",
    stride=50,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
)
inputs.keys()
In [ ]:
inputs["overflow_to_sample_mapping"]
In [ ]:
inputs = tokenizer(
    raw_datasets["train"][2:6]["question"],
    raw_datasets["train"][2:6]["context"],
    max_length=100,
    truncation="only_second",
    stride=50,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
)

print(f"The 4 examples gave {len(inputs['input_ids'])} features.")
print(f"Here is where each comes from: {inputs['overflow_to_sample_mapping']}.")
In [ ]:
answers = raw_datasets["train"][2:6]["answers"]
start_positions = []
end_positions = []

for i, offset in enumerate(inputs["offset_mapping"]):
    sample_idx = inputs["overflow_to_sample_mapping"][i]
    answer = answers[sample_idx]
    start_char = answer["answer_start"][0]
    end_char = answer["answer_start"][0] + len(answer["text"][0])
    sequence_ids = inputs.sequence_ids(i)

    # Find the start and end of the context
    idx = 0
    while sequence_ids[idx] != 1:
        idx += 1
    context_start = idx
    while sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1

    # If the answer is not fully inside the context, the label is (0, 0)
    if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
        start_positions.append(0)
        end_positions.append(0)
    else:
        # Otherwise it's the start and end token positions
        idx = context_start
        while idx <= context_end and offset[idx][0] <= start_char:
            idx += 1
        start_positions.append(idx - 1)

        idx = context_end
        while idx >= context_start and offset[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)

start_positions, end_positions
In [ ]:
idx = 0
sample_idx = inputs["overflow_to_sample_mapping"][idx]
answer = answers[sample_idx]["text"][0]

start = start_positions[idx]
end = end_positions[idx]
labeled_answer = tokenizer.decode(inputs["input_ids"][idx][start : end + 1])

print(f"Theoretical answer: {answer}, labels give: {labeled_answer}")
In [ ]:
idx = 4
sample_idx = inputs["overflow_to_sample_mapping"][idx]
answer = answers[sample_idx]["text"][0]

decoded_example = tokenizer.decode(inputs["input_ids"][idx])
print(f"Theoretical answer: {answer}, decoded example: {decoded_example}")
In [ ]:
max_length = 384
stride = 128


def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, the label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs
In [ ]:
train_dataset = raw_datasets["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)
len(raw_datasets["train"]), len(train_dataset)
In [ ]:
def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        # Keep offsets only for context tokens; mask the rest with None
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs
In [ ]:
validation_dataset = raw_datasets["test"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["test"].column_names,
)
len(raw_datasets["test"]), len(validation_dataset)
In [ ]:
small_eval_set = raw_datasets["test"].select(range(100))
trained_checkpoint = "etalab-ia/camembert-base-squadFR-fquad-piaf"

tokenizer = AutoTokenizer.from_pretrained(trained_checkpoint)
eval_set = small_eval_set.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["test"].column_names,
)
In [ ]:
import tensorflow as tf
from transformers import TFAutoModelForQuestionAnswering

eval_set_for_model = eval_set.remove_columns(["example_id", "offset_mapping"])
eval_set_for_model.set_format("numpy")

batch = {k: eval_set_for_model[k] for k in eval_set_for_model.column_names}
trained_model = TFAutoModelForQuestionAnswering.from_pretrained(trained_checkpoint)

outputs = trained_model(**batch)
In [ ]:
start_logits = outputs.start_logits.numpy()
end_logits = outputs.end_logits.numpy()
In [ ]:
import collections

example_to_features = collections.defaultdict(list)
for idx, feature in enumerate(eval_set):
    example_to_features[feature["example_id"]].append(idx)
In [ ]:
import numpy as np

n_best = 20
max_answer_length = 30
predicted_answers = []

for example in small_eval_set:
    example_id = example["id"]
    context = example["context"]
    answers = []

    for feature_index in example_to_features[example_id]:
        start_logit = start_logits[feature_index]
        end_logit = end_logits[feature_index]
        offsets = eval_set["offset_mapping"][feature_index]

        start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
        end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
        for start_index in start_indexes:
            for end_index in end_indexes:
                # Skip answers that are not fully inside the context
                if offsets[start_index] is None or offsets[end_index] is None:
                    continue
                # Skip answers with a length that is either < 0 or > max_answer_length
                if (
                    end_index < start_index
                    or end_index - start_index + 1 > max_answer_length
                ):
                    continue

                answers.append(
                    {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                )

    best_answer = max(answers, key=lambda x: x["logit_score"])
    predicted_answers.append({"id": example_id, "prediction_text": best_answer["text"]})
In [ ]:
from datasets import load_metric

metric = load_metric("squad")
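Note that load_metric has been deprecated in recent versions of datasets; if it is unavailable in your environment, the equivalent lives in the separate evaluate library:
In [ ]:
# Alternative for newer installs (assumes the `evaluate` package is available)
# import evaluate
# metric = evaluate.load("squad")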
In [ ]:
theoretical_answers = [
    {"id": ex["id"], "answers": ex["answers"]} for ex in small_eval_set
]
In [ ]:
print(predicted_answers[0])
print(theoretical_answers[0])
In [ ]:
metric.compute(predictions=predicted_answers, references=theoretical_answers)
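The compute call returns two scores: exact_match, the percentage of predictions that match a reference answer exactly, and f1, which also credits partial token overlap. A minimal sketch with made-up data, just to illustrate the expected input format:
In [ ]:
# Hypothetical mini-example of the SQuAD metric's input format
example_predictions = [{"id": "0", "prediction_text": "Paris"}]
example_references = [{"id": "0", "answers": {"text": ["Paris"], "answer_start": [0]}}]
metric.compute(predictions=example_predictions, references=example_references)
# Expected output: {'exact_match': 100.0, 'f1': 100.0}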
In [ ]:
from tqdm.auto import tqdm


def compute_metrics(start_logits, end_logits, features, examples):
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully inside the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)
In [ ]:
compute_metrics(start_logits, end_logits, eval_set, small_eval_set)
In [ ]:
model = TFAutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
In [ ]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator(return_tensors="tf")
In [ ]:
tf_train_dataset = model.prepare_tf_dataset(
    train_dataset,
    collate_fn=data_collator,
    shuffle=True,
    batch_size=16,
)
tf_eval_dataset = model.prepare_tf_dataset(
    validation_dataset,
    collate_fn=data_collator,
    shuffle=False,
    batch_size=16,
)
In [ ]:
from transformers import create_optimizer
from transformers.keras_callbacks import PushToHubCallback
import tensorflow as tf

# The number of training steps is the number of samples in the dataset divided
# by the batch size, then multiplied by the total number of epochs. Note that
# tf_train_dataset here is a batched tf.data.Dataset, not the original
# Hugging Face Dataset, so its len() is already num_samples // batch_size.
num_train_epochs = 3
num_train_steps = len(tf_train_dataset) * num_train_epochs
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

# Train in mixed-precision float16
tf.keras.mixed_precision.set_global_policy("mixed_float16")
In [ ]:
from transformers.keras_callbacks import PushToHubCallback

callback = PushToHubCallback(output_dir="camembert-base-finetuned-piaf", tokenizer=tokenizer)

# We will run the evaluation afterwards, so no validation mid-training
model.fit(tf_train_dataset, callbacks=[callback], epochs=num_train_epochs)
In [ ]:
predictions = model.predict(tf_eval_dataset)
compute_metrics(
    predictions["start_logits"],
    predictions["end_logits"],
    validation_dataset,
    raw_datasets["test"],
)
In [ ]:
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "huggingface-course/camembert-finetuned-piaf"
question_answerer = pipeline("question-answering", model=model_checkpoint)

# The context and question stay in French, since the model was fine-tuned on French data
context = """
🤗 Transformers est soutenu par les trois bibliothèques d'apprentissage profond les plus populaires - Jax, PyTorch et TensorFlow -
avec une intégration transparente entre elles. Il est simple d'entraîner vos modèles avec l'une avant de les charger pour l'inférence avec l'autre.
"""
question = "Quelles sont les bibliothèques d'apprentissage profond derrière 🤗 Transformers ?"
question_answerer(question=question, context=context)
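The question-answering pipeline returns a dictionary with the predicted answer text, its character offsets in the context, and a confidence score. A small sketch of how you might unpack it:
In [ ]:
result = question_answerer(question=question, context=context)
# The pipeline output has 'score', 'start', 'end' and 'answer' keys
print(f"Answer: {result['answer']!r} (score: {result['score']:.3f})")
print(f"Character span in context: [{result['start']}, {result['end']})")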