Kernel: Python 3
Fine-tuning a masked language model (PyTorch)
Install the 🤗 Datasets, 🤗 Transformers, and 🤗 Accelerate libraries to run this notebook.
In [ ]:
!pip install datasets transformers[sentencepiece]
!pip install accelerate
# To run training on a TPU, you will need to uncomment the following line:
# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
!apt install git-lfs
You will need to configure git; set your email and name in the following cell.
In [ ]:
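# Replace the placeholders below with your own email and name
!git config --global user.email "you@example.com"
!git config --global user.name "Your Name"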
You will also need to be logged in to the Hugging Face Hub. Run the following and enter your credentials.
In [ ]:
from huggingface_hub import notebook_login

notebook_login()
In [ ]:
from transformers import AutoModelForMaskedLM

model_checkpoint = "camembert-base"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
In [ ]:
text = "C'est une grande <mask>."
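Since CamemBERT is based on RoBERTa, its mask token is <mask>, which is why that literal string appears in the example text above; tokenizer.mask_token will match it.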
In [ ]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
In [ ]:
import torch

inputs = tokenizer(text, return_tensors="pt")
token_logits = model(**inputs).logits
# Find the location of <mask> and extract its logits
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the <mask> candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")
In [ ]:
from datasets import load_dataset

imdb_dataset = load_dataset("allocine")
imdb_dataset
In [ ]:
sample = imdb_dataset["train"].shuffle(seed=42).select(range(3))

for row in sample:
    print(f"\n'>>> Review: {row['review']}'")
    print(f"'>>> Label: {row['label']}'")
In [ ]:
def tokenize_function(examples):
    result = tokenizer(examples["review"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# Use batched=True to activate fast multithreading!
tokenized_datasets = imdb_dataset.map(
    tokenize_function, batched=True, remove_columns=["review", "label"]
)
tokenized_datasets
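Note that we keep the word_ids produced by the fast tokenizer: they record which word each token belongs to, and we will need them later to group tokens by word for whole word masking.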
In [ ]:
tokenizer.model_max_length
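For camembert-base this should return 512, the maximum context size the model accepts. We pick a smaller chunk_size below so the examples fit comfortably in GPU memory.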
In [ ]:
chunk_size = 128
In [ ]:
# Slicing produces a list of lists for each feature
tokenized_samples = tokenized_datasets["train"][:3]

for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Review {idx} length: {len(sample)}'")
In [ ]:
concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")
In [ ]:
chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")
In [ ]:
def group_texts(examples):
    # Concatenate all the texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute the length of the concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split into chunks of chunk_size
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result
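Copying input_ids into labels is all we need at this stage: the data collator will take care of masking the inputs, while the labels keep the original token ids that the model has to predict.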
In [ ]:
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets
In [ ]:
tokenizer.decode(lm_datasets["train"][1]["input_ids"])
In [ ]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
In [ ]:
samples = [lm_datasets["train"][i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")
In [ ]:
import collections

import numpy as np
from transformers import default_data_collator

wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and the indices of their corresponding tokens
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["labels"] = new_labels

    return default_data_collator(features)
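Unlike the default collator, which masks tokens independently, this one masks all the tokens of a given word together, and sets every non-masked label to -100 so those positions are ignored by the loss.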
In [ ]:
samples = [lm_datasets["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")
In [ ]:
train_size = 10_000
test_size = int(0.1 * train_size)

downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset
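Downsampling to 10,000 training examples keeps fine-tuning fast for this walkthrough; you can increase train_size if you have more compute available.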
In [ ]:
from transformers import TrainingArguments

batch_size = 64
# Show the training loss at every epoch
logging_steps = len(downsampled_dataset["train"]) // batch_size
model_name = model_checkpoint.split("/")[-1]

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-allocine",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=True,
    fp16=True,
    logging_steps=logging_steps,
)
In [ ]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)
In [ ]:
import math

eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
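Perplexity is the exponential of the cross-entropy loss returned by trainer.evaluate(), so lower values indicate a better language model.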
In [ ]:
trainer.train()
In [ ]:
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
In [ ]:
trainer.push_to_hub()
In [ ]:
def insert_random_mask(batch):
    features = [dict(zip(batch, t)) for t in zip(*batch.values())]
    masked_inputs = data_collator(features)
    # Create a new "masked" column for each column in the dataset
    return {"masked_" + k: v.numpy() for k, v in masked_inputs.items()}
In [ ]:
downsampled_dataset = downsampled_dataset.remove_columns(["word_ids"])
eval_dataset = downsampled_dataset["test"].map(
    insert_random_mask,
    batched=True,
    remove_columns=downsampled_dataset["test"].column_names,
)
eval_dataset = eval_dataset.rename_columns(
    {
        "masked_input_ids": "input_ids",
        "masked_attention_mask": "attention_mask",
        "masked_labels": "labels",
    }
)
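Applying the masking once with insert_random_mask and keeping the result fixes the evaluation set, so perplexity scores are comparable from epoch to epoch instead of changing with every random masking.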
In [ ]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

batch_size = 64
train_dataloader = DataLoader(
    downsampled_dataset["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    eval_dataset, batch_size=batch_size, collate_fn=default_data_collator
)
In [ ]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)
In [ ]:
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)
In [ ]:
from transformers import get_scheduler

num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
In [ ]:
from huggingface_hub import get_full_repo_name

model_name = "camembert-base-finetuned-allocine-accelerate"
repo_name = get_full_repo_name(model_name)
repo_name
In [ ]:
from huggingface_hub import Repository

output_dir = model_name
repo = Repository(output_dir, clone_from=repo_name)
In [ ]:
from tqdm.auto import tqdm
import torch
import math

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        losses.append(accelerator.gather(loss.repeat(batch_size)))

    losses = torch.cat(losses)
    losses = losses[: len(eval_dataset)]
    try:
        perplexity = math.exp(torch.mean(losses))
    except OverflowError:
        perplexity = float("inf")

    print(f">>> Epoch {epoch}: Perplexity: {perplexity}")

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}", blocking=False
        )
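With blocking=False the push to the Hub happens in the background, so training continues while each epoch's checkpoint is uploaded.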
In [ ]:
from transformers import pipeline

mask_filler = pipeline(
    "fill-mask",
    model="huggingface-course/camembert-base-finetuned-allocine",
    tokenizer="huggingface-course/camembert-base-finetuned-allocine",
)
In [ ]:
preds = mask_filler(text)

for pred in preds:
    print(f">>> {pred['sequence']}")