CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Sign Up | Sign In
huggingface

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.

GitHub Repository: huggingface/notebooks
Path: blob/main/course/fr/chapter7/section5_pt.ipynb
Views: 2555
Kernel: Python 3

Résumé (PyTorch)

Installez les bibliothèques 🤗 Datasets et 🤗 Transformers pour exécuter ce notebook.

# Install the Python libraries this notebook uses.
!pip install datasets transformers[sentencepiece]
!pip install accelerate
# To run the training on TPU, you need to uncomment the following line:
# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
# System package; presumably needed for the pushes to the Hub made later in the notebook.
!apt install git-lfs

Vous aurez besoin de configurer git : adaptez votre e-mail et votre nom dans la cellule suivante.

!git config --global user.email "[email protected]" !git config --global user.name "Your Name"

Vous devrez également être connecté au Hub d'Hugging Face. Exécutez ce qui suit et entrez vos informations d'identification.

from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from inside the notebook.
notebook_login()

from datasets import load_dataset

# Load the French ("fr") and English ("en") configurations of the review dataset.
# NOTE(review): confirm that "amazon_reviews_multi" is still downloadable from the Hub.
french_dataset = load_dataset("amazon_reviews_multi", "fr")
english_dataset = load_dataset("amazon_reviews_multi", "en")
# Last expression of the cell, so the notebook displays the dataset.
french_dataset
def show_samples(dataset, num_samples=3, seed=42):
    """Print the title and body of a few shuffled reviews from the train split.

    Args:
        dataset: Object indexable by split name; its "train" split must
            support ``shuffle(seed=...)`` and ``select(...)``.
        num_samples: Number of reviews to print.
        seed: Seed passed to the shuffle so the selection is repeatable.
    """
    shuffled_train = dataset["train"].shuffle(seed=seed)
    picked = shuffled_train.select(range(num_samples))
    for row in picked:
        print(f"\n'>> Title: {row['review_title']}'")
        print(f"'>> Review: {row['review_body']}'")


show_samples(french_dataset)
# Switch the dataset output format to pandas and pull the whole train split.
french_dataset.set_format("pandas")
french_df = french_dataset["train"][:]
# Show the counts for the first 20 product categories
french_df["product_category"].value_counts()[:20]
def filter_books(example):
    """Tell whether a review belongs to one of the two book categories.

    Args:
        example: Mapping with a "product_category" entry.

    Returns:
        True when the category is "book" or "digital_ebook_purchase",
        False otherwise.
    """
    return example["product_category"] in ("book", "digital_ebook_purchase")
# Undo the earlier set_format("pandas") so filtering works on regular examples.
french_dataset.reset_format()

# Keep only the book reviews in each language.
french_books = french_dataset.filter(filter_books)
english_books = english_dataset.filter(filter_books)
# Sample from the *filtered* French reviews to check that the filter worked;
# sampling the unfiltered dataset would show reviews from every category.
show_samples(french_books)
from datasets import concatenate_datasets, DatasetDict

# Build one bilingual dataset: for every split, append the French book
# reviews to the English ones and shuffle the result with a fixed seed.
books_dataset = DatasetDict()
for split in english_books:
    both_languages = [english_books[split], french_books[split]]
    books_dataset[split] = concatenate_datasets(both_languages).shuffle(seed=42)

# A few examples
show_samples(books_dataset)

# Keep only the reviews whose title has more than two whitespace-separated words.
books_dataset = books_dataset.filter(
    lambda review: len(review["review_title"].split()) > 2
)
from transformers import AutoTokenizer

# Checkpoint reused below for both the tokenizer and the model.
model_checkpoint = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Tokenize a short sentence and display the encoding.
inputs = tokenizer("J'ai adoré lire les Hunger Games !")
inputs

# Map the ids back to tokens to inspect how the sentence was split.
tokenizer.convert_ids_to_tokens(inputs.input_ids)
max_input_length = 512
max_target_length = 30


def preprocess_function(examples):
    """Tokenize review bodies as inputs and review titles as labels.

    Args:
        examples: Batch mapping with "review_body" and "review_title" entries.

    Returns:
        The tokenizer output for the bodies, truncated to
        ``max_input_length``, with an extra "labels" entry holding the
        input ids of the titles truncated to ``max_target_length``.
    """
    encoded_bodies = tokenizer(
        examples["review_body"],
        truncation=True,
        max_length=max_input_length,
    )
    # Set up the tokenizer for targets
    # NOTE(review): `as_target_tokenizer` is reported as deprecated in recent
    # transformers releases in favour of `text_target=` — confirm against the
    # installed version before upgrading.
    with tokenizer.as_target_tokenizer():
        encoded_titles = tokenizer(
            examples["review_title"],
            truncation=True,
            max_length=max_target_length,
        )

    encoded_bodies["labels"] = encoded_titles["input_ids"]
    return encoded_bodies
# Apply the preprocessing to every split, one batch of examples at a time.
tokenized_datasets = books_dataset.map(preprocess_function, batched=True)

# Two toy summaries used to try out the ROUGE metric below.
generated_summary = "J'ai absolument adoré lire les Hunger Games"
reference_summary = "J'ai adoré lire les Hunger Games"

!pip install rouge_score

from datasets import load_metric

# NOTE(review): `load_metric` is reported as deprecated in newer `datasets`
# releases in favour of the separate `evaluate` library — confirm against the
# installed version.
rouge_score = load_metric("rouge")

# Score the generated summary against the reference and display the result.
scores = rouge_score.compute(
    predictions=[generated_summary], references=[reference_summary]
)
scores

# Display only the `mid` entry of the ROUGE-1 result.
scores["rouge1"].mid
!pip install nltk

import nltk

# Download the "punkt" resource; presumably required by `sent_tokenize` used below.
nltk.download("punkt")
from nltk.tokenize import sent_tokenize


def three_sentence_summary(text):
    """Return the first three sentences of ``text``, one per line.

    Args:
        text: Text to summarize.

    Returns:
        At most three sentences, as split by ``sent_tokenize``, joined with
        newline characters.
    """
    leading_sentences = sent_tokenize(text)[:3]
    return "\n".join(leading_sentences)


print(three_sentence_summary(books_dataset["train"][1]["review_body"]))
def evaluate_baseline(dataset, metric):
    """Score the three-sentence baseline against the review titles.

    Args:
        dataset: Object exposing "review_body" and "review_title" columns.
        metric: Object with a ``compute(predictions=..., references=...)`` method.

    Returns:
        Whatever ``metric.compute`` returns for the baseline summaries.
    """
    baseline_summaries = list(map(three_sentence_summary, dataset["review_body"]))
    return metric.compute(
        predictions=baseline_summaries,
        references=dataset["review_title"],
    )
import pandas as pd

# Score the three-sentence baseline on the validation split.
score = evaluate_baseline(books_dataset["validation"], rouge_score)
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
# Keep the `mid` F-measure of each ROUGE variant as a percentage with two decimals.
rouge_dict = {
    name: round(score[name].mid.fmeasure * 100, 2) for name in rouge_names
}
rouge_dict
from transformers import AutoModelForSeq2SeqLM

# Load the sequence-to-sequence model from the same checkpoint as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

from transformers import Seq2SeqTrainingArguments

batch_size = 8
num_train_epochs = 8
# Show the training loss at every epoch
logging_steps = len(tokenized_datasets["train"]) // batch_size
# Last path component of the checkpoint name, used to name the output directory.
model_name = model_checkpoint.split("/")[-1]

# NOTE(review): newer transformers releases reportedly rename
# `evaluation_strategy` to `eval_strategy` — confirm against the installed version.
args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-amazon-en-fr",
    evaluation_strategy="epoch",
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,
    logging_steps=logging_steps,
    push_to_hub=True,
)
import numpy as np


def compute_metrics(eval_pred):
    """Compute ROUGE F-measures (as percentages) for generated summaries.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays.

    Returns:
        Dict mapping each key returned by the ROUGE metric to its ``mid``
        F-measure multiplied by 100 and rounded to four decimals.
    """

    def _decode_one_sentence_per_line(token_ids):
        # ROUGE expects a newline after each sentence
        texts = tokenizer.batch_decode(token_ids, skip_special_tokens=True)
        return ["\n".join(sent_tokenize(text.strip())) for text in texts]

    generated_ids, label_ids = eval_pred
    # Decode the generated summaries into text
    summaries = _decode_one_sentence_per_line(generated_ids)
    # Replace -100 in the labels as we can't decode them
    label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)
    # Decode the reference summaries into text
    references = _decode_one_sentence_per_line(label_ids)

    # Compute the ROUGE scores
    rouge = rouge_score.compute(
        predictions=summaries, references=references, use_stemmer=True
    )
    # Extract the `mid` F-measure of every score
    return {
        name: round(aggregate.mid.fmeasure * 100, 4)
        for name, aggregate in rouge.items()
    }
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Drop the original dataset columns so only the tokenizer outputs remain.
tokenized_datasets = tokenized_datasets.remove_columns(
    books_dataset["train"].column_names
)

# Run the collator on the first two training examples and display the batch.
features = [tokenized_datasets["train"][i] for i in range(2)]
data_collator(features)
from transformers import Seq2SeqTrainer

# Wire together the model, training arguments, datasets, collator and metric function.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune the model.
trainer.train()

# Evaluate on the validation split passed as `eval_dataset`.
trainer.evaluate()
# Switch the dataset output format to "torch" for the custom training loop.
tokenized_datasets.set_format("torch")

# Reload the model from the original checkpoint, replacing the instance used with the Trainer above.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

from torch.utils.data import DataLoader

batch_size = 8
# Only the training dataloader shuffles its examples.
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=batch_size,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], collate_fn=data_collator, batch_size=batch_size
)
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)

from accelerate import Accelerator

accelerator = Accelerator()
# Hand the model, optimizer and both dataloaders to the accelerator and keep the returned objects.
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

from transformers import get_scheduler

num_train_epochs = 10
# Measured on the dataloader returned by `accelerator.prepare`.
# NOTE(review): presumably because its length can differ from the original one — confirm.
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# Linear learning-rate schedule over all training steps, without warmup.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
def postprocess_text(preds, labels):
    """Strip each text and put every sentence on its own line.

    Args:
        preds: Iterable of predicted summaries (strings).
        labels: Iterable of reference summaries (strings).

    Returns:
        Tuple ``(preds, labels)`` of lists in which each text has been
        stripped and its sentences joined with newline characters.
    """

    def _one_sentence_per_line(text):
        # ROUGE expects a newline after each sentence
        return "\n".join(nltk.sent_tokenize(text.strip()))

    formatted_preds = [_one_sentence_per_line(pred) for pred in preds]
    formatted_labels = [_one_sentence_per_line(label) for label in labels]
    return formatted_preds, formatted_labels
from huggingface_hub import get_full_repo_name

# Name of the Hub repository that receives the model trained with Accelerate.
# It describes the actual model (mT5 fine-tuned on Amazon reviews), not the
# BERT/SQuAD names left over from another notebook.
model_name = "mt5-small-finetuned-amazon-en-fr-accelerate"
repo_name = get_full_repo_name(model_name)
repo_name

from huggingface_hub import Repository

# Local directory cloned from the Hub repository; checkpoints are saved here.
output_dir = "results-mt5-small-finetuned-amazon-en-fr-accelerate"
repo = Repository(output_dir, clone_from=repo_name)
from tqdm.auto import tqdm
import torch
import numpy as np

# One progress-bar tick per optimizer step over the whole run.
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
            )

            generated_tokens = accelerator.pad_across_processes(
                generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
            )
            # If we did not pad to the maximum length, we also need to pad the labels
            labels = accelerator.pad_across_processes(
                batch["labels"], dim=1, pad_index=tokenizer.pad_token_id
            )

            # Both results are NumPy arrays from here on.
            generated_tokens = accelerator.gather(generated_tokens).cpu().numpy()
            labels = accelerator.gather(labels).cpu().numpy()

            # Replace -100 in the labels as we can't decode them
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

            decoded_preds = tokenizer.batch_decode(
                generated_tokens, skip_special_tokens=True
            )
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            decoded_preds, decoded_labels = postprocess_text(
                decoded_preds, decoded_labels
            )

            rouge_score.add_batch(predictions=decoded_preds, references=decoded_labels)

    # Compute the metrics
    result = rouge_score.compute()
    # Extract the `mid` ROUGE F-measures as percentages
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    result = {k: round(v, 4) for k, v in result.items()}
    print(f"Epoch {epoch}:", result)

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    # Only the main process saves the tokenizer and pushes to the Hub.
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}", blocking=False
        )
from transformers import pipeline

# Identifier of the fine-tuned model to load from the Hub.
hub_model_id = "huggingface-course/mt5-small-finetuned-amazon-en-fr"
summarizer = pipeline("summarization", model=hub_model_id)
def print_summary(idx):
    """Print a test review, its title and the summary produced for it.

    Args:
        idx: Index of the example in the "test" split of ``books_dataset``.
    """
    example = books_dataset["test"][idx]
    review = example["review_body"]
    title = example["review_title"]
    # The summarizer returns a list; the text is under "summary_text" in its first item.
    summary = summarizer(review)[0]["summary_text"]
    print(f"'>>> Review: {review}'")
    print(f"\n'>>> Title: {title}'")
    print(f"\n'>>> Summary: {summary}'")
# Show the review, title and generated summary for test example 100.
print_summary(100)

# Same for the first test example.
print_summary(0)