Translation (PyTorch)
Install the 🤗 Datasets and 🤗 Transformers libraries to run this notebook.
In [ ]:
!pip install datasets transformers[sentencepiece]
!pip install accelerate
# To run training on a TPU, you will need to uncomment the following line:
# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
!apt install git-lfs
You will need to set up git; adapt your email and name in the following cell.
In [ ]:
!git config --global user.email "you@example.com"
!git config --global user.name "Your Name"
You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials.
In [ ]:
from huggingface_hub import notebook_login

notebook_login()
In [ ]:
from datasets import load_dataset, load_metric

raw_datasets = load_dataset("kde4", lang1="en", lang2="fr")
In [ ]:
raw_datasets
In [ ]:
split_datasets = raw_datasets["train"].train_test_split(train_size=0.9, seed=20)
split_datasets
In [ ]:
split_datasets["validation"] = split_datasets.pop("test")
In [ ]:
split_datasets["train"][1]["translation"]
In [ ]:
from transformers import pipeline

model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
translator = pipeline("translation", model=model_checkpoint)
translator("Default to expanded threads")
In [ ]:
split_datasets["train"][172]["translation"]
In [ ]:
translator(
    "Unable to import %1 using the OFX importer plugin. This file is not the correct format."
)
In [ ]:
from transformers import AutoTokenizer

model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
# This is the PyTorch notebook, so we ask for PyTorch tensors ("pt", not "tf")
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt")
In [ ]:
en_sentence = split_datasets["train"][1]["translation"]["en"]
fr_sentence = split_datasets["train"][1]["translation"]["fr"]

inputs = tokenizer(en_sentence)
with tokenizer.as_target_tokenizer():
    targets = tokenizer(fr_sentence)
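Note that in more recent versions of 🤗 Transformers (v4.22 and later), the `as_target_tokenizer` context manager is deprecated in favor of the `text_target` argument. A minimal equivalent sketch, assuming you are on such a version:
In [ ]:
# Equivalent target tokenization on transformers >= 4.22 (version assumption)
targets = tokenizer(text_target=fr_sentence)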
In [ ]:
wrong_targets = tokenizer(fr_sentence)
print(tokenizer.convert_ids_to_tokens(wrong_targets["input_ids"]))
print(tokenizer.convert_ids_to_tokens(targets["input_ids"]))
In [ ]:
max_input_length = 128
max_target_length = 128


def preprocess_function(examples):
    inputs = [ex["en"] for ex in examples["translation"]]
    targets = [ex["fr"] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Set up the tokenizer for the targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
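With the `text_target` argument mentioned above, the same preprocessing can be written without the context manager. A variant, again assuming transformers v4.22+ (here the single `max_length` applies to both inputs and targets, which is fine since both limits are 128):
In [ ]:
# Variant of preprocess_function using text_target (assumption: transformers >= 4.22)
def preprocess_function(examples):
    inputs = [ex["en"] for ex in examples["translation"]]
    targets = [ex["fr"] for ex in examples["translation"]]
    # Tokenizes inputs and targets in one call; targets end up in model_inputs["labels"]
    model_inputs = tokenizer(
        inputs, text_target=targets, max_length=max_input_length, truncation=True
    )
    return model_inputs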
In [ ]:
tokenized_datasets = split_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=split_datasets["train"].column_names,
)
In [ ]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
In [ ]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
In [ ]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(1, 3)])
batch.keys()
In [ ]:
batch["labels"]
In [ ]:
batch["decoder_input_ids"]
In [ ]:
for i in range(1, 3):
    print(tokenized_datasets["train"][i]["labels"])
In [ ]:
!pip install sacrebleu
In [ ]:
from datasets import load_metric

metric = load_metric("sacrebleu")
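Note: in recent versions of 🤗 Datasets, `load_metric` is deprecated and metrics live in the separate `evaluate` library. An equivalent sketch, assuming you install that package first (the resulting `metric` object has the same `compute`/`add_batch` API used below):
In [ ]:
# Equivalent metric loading via the evaluate library (assumes a recent environment)
!pip install evaluate
import evaluate

metric = evaluate.load("sacrebleu")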
In [ ]:
predictions = [
    "This plugin lets you translate web pages between several languages automatically."
]
references = [
    [
        "This plugin allows you to automatically translate web pages between several languages."
    ]
]
metric.compute(predictions=predictions, references=references)
In [ ]:
predictions = ["This This This This"] references = [ [ "This plugin allows you to automatically translate web pages between several languages." ] ] metric.compute(predictions=predictions, references=references)
In [ ]:
predictions = ["This plugin"] references = [ [ "This plugin allows you to automatically translate web pages between several languages." ] ] metric.compute(predictions=predictions, references=references)
In [ ]:
import numpy as np


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100s in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}
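As a quick sanity check (illustrative only, not part of the original notebook), feeding the same token ids as both predictions and labels should yield a BLEU score of 100:
In [ ]:
# Smoke test for compute_metrics with a tiny fake batch (illustrative)
fake_ids = np.array([targets["input_ids"]])
compute_metrics((fake_ids, fake_ids))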
In [ ]:
from huggingface_hub import notebook_login

notebook_login()
In [ ]:
from transformers import Seq2SeqTrainingArguments

args = Seq2SeqTrainingArguments(
    "marian-finetuned-kde4-en-to-fr",
    evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)
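Two compatibility notes: `fp16=True` requires a CUDA GPU, and on recent 🤗 Transformers releases (v4.41 and later) the `evaluation_strategy` argument has been renamed `eval_strategy`. A variant sketch for such a release:
In [ ]:
# Same arguments on recent transformers releases, where evaluation_strategy
# became eval_strategy (version assumption)
args = Seq2SeqTrainingArguments(
    "marian-finetuned-kde4-en-to-fr",
    eval_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,  # requires a CUDA GPU
    push_to_hub=True,
)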
In [ ]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
In [ ]:
trainer.evaluate(max_length=max_target_length)
In [ ]:
trainer.train()
In [ ]:
trainer.evaluate(max_length=max_target_length)
In [ ]:
trainer.push_to_hub(tags="translation", commit_message="Training complete")
In [ ]:
from torch.utils.data import DataLoader

tokenized_datasets.set_format("torch")
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], collate_fn=data_collator, batch_size=8
)
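To sanity-check the collation (an illustrative check, not part of the original notebook), you can peek at the shapes of one batch:
In [ ]:
# Inspect one collated batch (illustrative); all tensors share the batch dimension of 8
batch = next(iter(train_dataloader))
{k: v.shape for k, v in batch.items()}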
In [ ]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
In [ ]:
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)
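Note that `transformers.AdamW` is deprecated in recent releases in favor of PyTorch's built-in implementation, which is a drop-in replacement here:
In [ ]:
# Drop-in replacement using PyTorch's own AdamW (transformers.AdamW is deprecated in newer versions)
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)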
In [ ]:
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)
In [ ]:
from transformers import get_scheduler

num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
In [ ]:
from huggingface_hub import Repository, get_full_repo_name

model_name = "marian-finetuned-kde4-en-to-fr-accelerate"
repo_name = get_full_repo_name(model_name)
repo_name
In [ ]:
output_dir = "marian-finetuned-kde4-en-to-fr-accelerate"
repo = Repository(output_dir, clone_from=repo_name)
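Note: recent versions of `huggingface_hub` deprecate the git-based `Repository` class in favor of HTTP uploads. A minimal sketch of that alternative, assuming a recent release (the training loop below keeps the original `Repository` workflow):
In [ ]:
# HTTP-based alternative to Repository (assumes a recent huggingface_hub release)
from huggingface_hub import HfApi

api = HfApi()
api.create_repo(repo_name, exist_ok=True)
# After saving the model and tokenizer to output_dir, upload with:
# api.upload_folder(folder_path=output_dir, repo_id=repo_name)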
In [ ]:
def postprocess(predictions, labels):
    predictions = predictions.cpu().numpy()
    labels = labels.cpu().numpy()

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace -100s in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]
    return decoded_preds, decoded_labels
In [ ]:
from tqdm.auto import tqdm
import torch

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for batch in tqdm(eval_dataloader):
        with torch.no_grad():
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
                max_length=128,
            )
        labels = batch["labels"]

        # Necessary to pad predictions and labels before gathering them
        generated_tokens = accelerator.pad_across_processes(
            generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
        )
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(generated_tokens)
        labels_gathered = accelerator.gather(labels)

        decoded_preds, decoded_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=decoded_preds, references=decoded_labels)

    results = metric.compute()
    print(f"epoch {epoch}, BLEU score: {results['score']:.2f}")

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}", blocking=False
        )
In [ ]:
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "huggingface-course/marian-finetuned-kde4-en-to-fr"
translator = pipeline("translation", model=model_checkpoint)
translator("Default to expanded threads")
In [ ]:
translator(
    "Unable to import %1 using the OFX importer plugin. This file is not the correct format."
)