
Debugging the training pipeline

Since this chapter is about debugging, the language of the data matters little here. We are mainly interested in the logic of the code, in order to understand where the error comes from.

Install the 🤗 Transformers and 🤗 Datasets libraries to run this notebook.

!pip install datasets transformers[sentencepiece]
from datasets import load_dataset, load_metric
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

raw_datasets = load_dataset("glue", "mnli")

model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


def preprocess_function(examples):
    return tokenizer(examples["premise"], examples["hypothesis"], truncation=True)


tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

args = TrainingArguments(
    f"distilbert-finetuned-mnli",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)

metric = load_metric("glue", "mnli")


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    return metric.compute(predictions=predictions, references=labels)


trainer = Trainer(
    model,
    args,
    train_dataset=raw_datasets["train"],
    eval_dataset=raw_datasets["validation_matched"],
    compute_metrics=compute_metrics,
)
trainer.train()
trainer.train_dataset[0]
from datasets import load_dataset, load_metric
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

raw_datasets = load_dataset("glue", "mnli")

model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


def preprocess_function(examples):
    return tokenizer(examples["premise"], examples["hypothesis"], truncation=True)


tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

args = TrainingArguments(
    f"distilbert-finetuned-mnli",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)

metric = load_metric("glue", "mnli")


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    return metric.compute(predictions=predictions, references=labels)


trainer = Trainer(
    model,
    args,
    # pass the tokenized datasets this time, not the raw ones
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation_matched"],
    compute_metrics=compute_metrics,
)
trainer.train()
tokenizer.decode(trainer.train_dataset[0]["input_ids"])
trainer.train_dataset[0].keys()
type(trainer.model)
trainer.train_dataset[0]["attention_mask"]
len(trainer.train_dataset[0]["attention_mask"]) == len(
    trainer.train_dataset[0]["input_ids"]
)
trainer.train_dataset[0]["label"]
trainer.train_dataset.features["label"].names
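
The label names tell us what the integer classes mean. As an extra check, which is not part of the original notebook, one could also verify that every label value in the training set falls in the expected range before going further:

import collections

# Hypothetical extra check: count the label values actually present in the
# training set. For MNLI we expect only 0, 1 and 2.
collections.Counter(trainer.train_dataset["label"])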
for batch in trainer.get_train_dataloader():
    break
data_collator = trainer.get_train_dataloader().collate_fn
data_collator
from datasets import load_dataset, load_metric
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)

raw_datasets = load_dataset("glue", "mnli")

model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


def preprocess_function(examples):
    return tokenizer(examples["premise"], examples["hypothesis"], truncation=True)


tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

args = TrainingArguments(
    f"distilbert-finetuned-mnli",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)

metric = load_metric("glue", "mnli")


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    return metric.compute(predictions=predictions, references=labels)


# pad each batch dynamically with the tokenizer's padding token
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation_matched"],
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()
data_collator = trainer.get_train_dataloader().collate_fn
batch = data_collator([trainer.train_dataset[i] for i in range(4)])
data_collator = trainer.get_train_dataloader().collate_fn
actual_train_set = trainer._remove_unused_columns(trainer.train_dataset)
batch = data_collator([actual_train_set[i] for i in range(4)])
for batch in trainer.get_train_dataloader():
    break
outputs = trainer.model.cpu()(**batch)
trainer.model.config.num_labels
from datasets import load_dataset, load_metric
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)

raw_datasets = load_dataset("glue", "mnli")

model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


def preprocess_function(examples):
    return tokenizer(examples["premise"], examples["hypothesis"], truncation=True)


tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
# MNLI has three classes, so the classification head needs num_labels=3
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=3)

args = TrainingArguments(
    f"distilbert-finetuned-mnli",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)

metric = load_metric("glue", "mnli")


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    return metric.compute(predictions=predictions, references=labels)


data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation_matched"],
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=tokenizer,
)
for batch in trainer.get_train_dataloader():
    break

outputs = trainer.model.cpu()(**batch)
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
batch = {k: v.to(device) for k, v in batch.items()}

outputs = trainer.model.to(device)(**batch)
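
The forward pass now runs on the GPU as well. As a quick sanity check, not part of the original notebook, one could look at the loss value and the shape of the logits:

# Hypothetical sanity check: the loss should be a finite scalar and the logits
# should have shape (batch_size, num_labels) -- (8, 3) here with the default batch size.
print(outputs.loss)
print(outputs.logits.shape)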
loss = outputs.loss
loss.backward()
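
The backward pass runs without error. Before taking an optimizer step, an optional extra check, not in the original notebook, is to make sure no gradient came out NaN or infinite:

# Hypothetical extra check: report any parameter whose gradient is not finite.
for name, param in trainer.model.named_parameters():
    if param.grad is not None and not torch.isfinite(param.grad).all():
        print(f"Non-finite gradient in {name}")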
trainer.create_optimizer()
trainer.optimizer.step()
# This will take a long time and error out, so you shouldn't run this cell
trainer.train()
trainer.evaluate()
for batch in trainer.get_eval_dataloader():
    break

batch = {k: v.to(device) for k, v in batch.items()}

with torch.no_grad():
    outputs = trainer.model(**batch)
predictions = outputs.logits.cpu().numpy()
labels = batch["labels"].cpu().numpy()

compute_metrics((predictions, labels))
predictions.shape, labels.shape
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)


compute_metrics((predictions, labels))
import numpy as np
from datasets import load_dataset, load_metric
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)

raw_datasets = load_dataset("glue", "mnli")

model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


def preprocess_function(examples):
    return tokenizer(examples["premise"], examples["hypothesis"], truncation=True)


tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=3)

args = TrainingArguments(
    f"distilbert-finetuned-mnli",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
)

metric = load_metric("glue", "mnli")


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # convert the logits to class predictions before computing the metric
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)


data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation_matched"],
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()
for batch in trainer.get_train_dataloader():
    break

batch = {k: v.to(device) for k, v in batch.items()}
trainer.create_optimizer()

for _ in range(20):
    outputs = trainer.model(**batch)
    loss = outputs.loss
    loss.backward()
    trainer.optimizer.step()
    trainer.optimizer.zero_grad()
with torch.no_grad():
    outputs = trainer.model(**batch)
preds = outputs.logits
labels = batch["labels"]

compute_metrics((preds.cpu().numpy(), labels.cpu().numpy()))