Kernel: Python 3
Summarization (TensorFlow)
Install the 🤗 Datasets and 🤗 Transformers libraries to run this notebook.
In [ ]:
!pip install datasets transformers[sentencepiece]
!apt install git-lfs
You will need to configure git; adapt your email and name in the following cell.
In [ ]:
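!git config --global user.email "you@example.com"
!git config --global user.name "Your Name"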
You will also need to be logged in to the Hugging Face Hub. Run the following cell and enter your credentials.
In [ ]:
from huggingface_hub import notebook_login

notebook_login()
In [ ]:
from datasets import load_dataset

french_dataset = load_dataset("amazon_reviews_multi", "fr")
english_dataset = load_dataset("amazon_reviews_multi", "en")
french_dataset
In [ ]:
def show_samples(dataset, num_samples=3, seed=42):
    sample = dataset["train"].shuffle(seed=seed).select(range(num_samples))
    for example in sample:
        print(f"\n'>> Title: {example['review_title']}'")
        print(f"'>> Review: {example['review_body']}'")


show_samples(french_dataset)
In [ ]:
french_dataset.set_format("pandas")
french_df = french_dataset["train"][:]

# Show the counts for the top 20 products
french_df["product_category"].value_counts()[:20]
In [ ]:
def filter_books(example):
    return (
        example["product_category"] == "book"
        or example["product_category"] == "digital_ebook_purchase"
    )
In [ ]:
french_dataset.reset_format()
In [ ]:
french_books = french_dataset.filter(filter_books)
english_books = english_dataset.filter(filter_books)
show_samples(french_books)
In [ ]:
from datasets import concatenate_datasets, DatasetDict

books_dataset = DatasetDict()

for split in english_books.keys():
    books_dataset[split] = concatenate_datasets(
        [english_books[split], french_books[split]]
    )
    books_dataset[split] = books_dataset[split].shuffle(seed=42)

# Show a few examples
show_samples(books_dataset)
In [ ]:
books_dataset = books_dataset.filter(lambda x: len(x["review_title"].split()) > 2)
In [ ]:
from transformers import AutoTokenizer

model_checkpoint = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
In [ ]:
inputs = tokenizer("J'ai adoré lire les Hunger Games !") inputs
In [ ]:
tokenizer.convert_ids_to_tokens(inputs.input_ids)
In [ ]:
max_input_length = 512
max_target_length = 30


def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["review_body"], max_length=max_input_length, truncation=True
    )
    # Set up the tokenizer for the targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["review_title"], max_length=max_target_length, truncation=True
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
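Note that `as_target_tokenizer()` is deprecated in more recent versions of 🤗 Transformers in favor of the `text_target` argument. A minimal sketch of an equivalent preprocessing function, assuming transformers >= 4.21:

In [ ]:
def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["review_body"], max_length=max_input_length, truncation=True
    )
    # Tokenize the targets directly via text_target (no context manager needed)
    labels = tokenizer(
        text_target=examples["review_title"],
        max_length=max_target_length,
        truncation=True,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs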
In [ ]:
tokenized_datasets = books_dataset.map(preprocess_function, batched=True)
In [ ]:
generated_summary = "J'ai absolument adoré lire les Hunger Games"
reference_summary = "J'ai adoré lire les Hunger Games"
In [ ]:
!pip install rouge_score
In [ ]:
from datasets import load_metric

rouge_score = load_metric("rouge")
In [ ]:
scores = rouge_score.compute(
    predictions=[generated_summary], references=[reference_summary]
)
scores
In [ ]:
scores["rouge1"].mid
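The `rouge` metric returns an `AggregateScore` for each ROUGE variant, with `low`, `mid`, and `high` confidence intervals; each of these is a `Score` named tuple with `precision`, `recall`, and `fmeasure` fields. A small sketch of unpacking the median estimate:

In [ ]:
# Unpack the mid (median) estimate of ROUGE-1 into its components
mid = scores["rouge1"].mid
print(f"precision={mid.precision:.3f}, recall={mid.recall:.3f}, fmeasure={mid.fmeasure:.3f}")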
In [ ]:
!pip install nltk
In [ ]:
import nltk

nltk.download("punkt")
In [ ]:
from nltk.tokenize import sent_tokenize


def three_sentence_summary(text):
    return "\n".join(sent_tokenize(text)[:3])


print(three_sentence_summary(books_dataset["train"][1]["review_body"]))
In [ ]:
def evaluate_baseline(dataset, metric):
    summaries = [three_sentence_summary(text) for text in dataset["review_body"]]
    return metric.compute(predictions=summaries, references=dataset["review_title"])
In [ ]:
import pandas as pd

score = evaluate_baseline(books_dataset["validation"], rouge_score)
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_dict = dict((rn, round(score[rn].mid.fmeasure * 100, 2)) for rn in rouge_names)
rouge_dict
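The `pandas` import above isn't otherwise used in this cell; one illustrative way to put it to work is rendering the baseline scores as a small table:

In [ ]:
# Display the baseline ROUGE F1 scores as a one-column table
pd.DataFrame.from_dict(rouge_dict, orient="index", columns=["baseline F1"])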
In [ ]:
from transformers import TFAutoModelForSeq2SeqLM

model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
In [ ]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")
In [ ]:
tokenized_datasets = tokenized_datasets.remove_columns(
    books_dataset["train"].column_names
)
In [ ]:
features = [tokenized_datasets["train"][i] for i in range(2)]
data_collator(features)
In [ ]:
tf_train_dataset = model.prepare_tf_dataset(
    tokenized_datasets["train"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=8,
)
tf_eval_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=8,
)
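As a quick sanity check, you can pull a single batch and inspect the tensor shapes the collator produces; this assumes `prepare_tf_dataset` yields `(inputs, labels)` tuples, which the generation loop later in this notebook also relies on:

In [ ]:
# Peek at one batch to confirm the shapes produced by the data collator
batch, labels = next(iter(tf_train_dataset))
print({key: value.shape for key, value in batch.items()})
print("labels:", labels.shape)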
In [ ]:
from transformers import create_optimizer
import tensorflow as tf

# The number of training steps is the number of samples in the dataset, divided
# by the batch size, then multiplied by the total number of epochs. Note that
# tf_train_dataset here is a batched tf.data.Dataset, not the original Hugging
# Face dataset, so its len() is already num_samples // batch_size.
num_train_epochs = 8
num_train_steps = len(tf_train_dataset) * num_train_epochs
model_name = model_checkpoint.split("/")[-1]

optimizer, schedule = create_optimizer(
    init_lr=5.6e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)

model.compile(optimizer=optimizer)

# Train in mixed-precision float16
tf.keras.mixed_precision.set_global_policy("mixed_float16")
In [ ]:
tf.config.run_functions_eagerly(True)
In [ ]:
from transformers.keras_callbacks import PushToHubCallback

callback = PushToHubCallback(
    output_dir=f"{model_name}-finetuned-amazon-en-fr", tokenizer=tokenizer
)

model.fit(
    tf_train_dataset,
    validation_data=tf_eval_dataset,
    callbacks=[callback],
    epochs=num_train_epochs,
)
In [ ]:
from tqdm import tqdm
import numpy as np

# Pad to a fixed multiple so XLA doesn't recompile for every new batch shape
generation_data_collator = DataCollatorForSeq2Seq(
    tokenizer, model=model, return_tensors="tf", pad_to_multiple_of=320
)

tf_generate_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"],
    collate_fn=generation_data_collator,
    shuffle=False,
    batch_size=8,
    drop_remainder=True,
)


@tf.function(jit_compile=True)
def generate_with_xla(batch):
    return model.generate(
        input_ids=batch["input_ids"],
        attention_mask=batch["attention_mask"],
        max_new_tokens=32,
    )


all_preds = []
all_labels = []
for batch, labels in tqdm(tf_generate_dataset):
    predictions = generate_with_xla(batch)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = labels.numpy()
    # Replace -100 in the labels, since we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    all_preds.extend(decoded_preds)
    all_labels.extend(decoded_labels)
In [ ]:
result = rouge_score.compute(
    predictions=all_preds, references=all_labels, use_stemmer=True
)
result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
{k: round(v, 4) for k, v in result.items()}
In [ ]:
result
In [ ]:
from transformers import pipeline

hub_model_id = "huggingface-course/mt5-small-finetuned-amazon-en-fr"
summarizer = pipeline("summarization", model=hub_model_id)
In [ ]:
def print_summary(idx):
    review = books_dataset["test"][idx]["review_body"]
    title = books_dataset["test"][idx]["review_title"]
    summary = summarizer(books_dataset["test"][idx]["review_body"])[0]["summary_text"]
    print(f"'>>> Review: {review}'")
    print(f"\n'>>> Title: {title}'")
    print(f"\n'>>> Summary: {summary}'")
In [ ]:
print_summary(100)
In [ ]:
print_summary(0)