Path: blob/main/course/en/chapter3/section7.ipynb
Kernel: Python 3
In [ ]:
# Install PyTorch & other libraries
%pip install -qqq torch torchvision setuptools scikit-learn

# Install Hugging Face libraries
%pip install --upgrade -qqq datasets accelerate hf-transfer transformers
In [ ]:
from datasets import load_dataset

# Dataset id from huggingface.co/datasets
dataset_id = "burtenshaw/PleIAs_common_corpus_code_classification"

# Load raw dataset
dataset = load_dataset(dataset_id)
In [ ]:
print(len(dataset["train"]))
print(dataset["train"][0])
In [ ]:
from transformers import AutoTokenizer

# Model id to load the tokenizer
model_id = "answerdotai/ModernBERT-base"

# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Tokenize helper function
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True, return_tensors="pt")

# Tokenize dataset
tokenized_dataset = dataset.map(tokenize, batched=True, remove_columns=["text"])

tokenized_dataset["train"].features.keys()
# dict_keys(['labels', 'input_ids', 'attention_mask'])
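To see what the tokenizer produces, it can help to run it on a single snippet first. This is only a quick sanity check; the example string below is made up for illustration.
In [ ]:
# Tokenize a single, made-up code snippet and inspect the first few tokens
sample_text = "def hello():\n    return 'world'"
encoded = tokenizer(sample_text)
print(encoded["input_ids"][:10])
print(tokenizer.convert_ids_to_tokens(encoded["input_ids"])[:10])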
In [ ]:
from transformers import AutoModelForSequenceClassification

# Model id to load the model
model_id = "answerdotai/ModernBERT-base"

# Prepare model labels - useful for inference
labels = list(set(tokenized_dataset["train"]["labels"]))
num_labels = len(labels)
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label
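Before building the model, it is worth confirming that the label mappings look sensible; this just prints what the previous cell produced.
In [ ]:
# Inspect the label set and the id <-> label mappings
print(f"Number of labels: {num_labels}")
print(label2id)
print(id2label)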
In [ ]:
# Download the model from huggingface.co/models
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)
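As a quick check that the classification head was attached, we can count the model's trainable parameters (ModernBERT-base should land somewhere around 150M).
In [ ]:
# Count trainable parameters in the sequence classification model
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable parameters: {num_params:,}")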
In [ ]:
import numpy as np
from sklearn.metrics import f1_score

# Metric helper method
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    score = f1_score(labels, predictions, average="weighted")
    return {"f1": float(score)}
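The metric function can be sanity-checked before training; the logits and labels below are invented and assume a two-class problem.
In [ ]:
# Sanity-check compute_metrics with invented logits and labels (two classes assumed)
dummy_logits = np.array([[2.0, 0.1], [0.2, 1.5], [3.0, 0.3]])
dummy_labels = np.array([0, 1, 1])
print(compute_metrics((dummy_logits, dummy_labels)))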
In [ ]:
from huggingface_hub import HfFolder
from transformers import Trainer, TrainingArguments

# Define training args
training_args = TrainingArguments(
    output_dir="ModernBERT-code-classifier",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    learning_rate=5e-5,
    num_train_epochs=5,
    bf16=True,  # bfloat16 training
    optim="adamw_torch_fused",  # improved optimizer
    # logging & evaluation strategies
    logging_strategy="steps",
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    # push to hub parameters
    push_to_hub=True,
    hub_strategy="every_save",
    hub_token=HfFolder.get_token(),
    report_to="wandb",
)
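With a per-device batch size of 1 and 16 gradient accumulation steps, the optimizer effectively sees 16 examples per update; the small check below just computes that product from the arguments.
In [ ]:
# The effective batch size is per-device batch size x gradient accumulation steps
effective_batch_size = (
    training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps
)
print(f"Effective train batch size per device: {effective_batch_size}")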
Overfitting

To provoke overfitting, we train on a tiny 100-example subset of the training split: the model can memorise it, so the training loss keeps falling while performance on the unseen test split lags behind.
In [ ]:
limited_dataset = tokenized_dataset["train"].select(range(100))

# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=limited_dataset,
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)
trainer.train()
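Evaluating on the held-out test split makes the gap visible: after memorising 100 examples, the training loss is typically far lower than the evaluation results would suggest.
In [ ]:
# Evaluate the overfit model on the test split; compare with the training loss above
trainer.evaluate()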
In [ ]:
# Clear memory; keep limited_dataset because the next experiments reuse it
import torch

torch.cuda.empty_cache()
del trainer
del model
Underfitting

To demonstrate underfitting, we keep the same setup but set the learning rate far too low, so the weights barely change and both training and evaluation metrics stay poor.
In [ ]:
# Reload a fresh model so the previous run does not skew the results
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

# define a low learning rate
training_args.learning_rate = 1e-7

# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=limited_dataset,
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)
trainer.train()
In [ ]:
# clear memory
import torch

torch.cuda.empty_cache()
del trainer
del model
Just right! 🥣

With a sensible learning rate the model actually learns from the data, and training and evaluation metrics improve together.
In [ ]:
# Reload a fresh model once more
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

# define a valid learning rate
training_args.learning_rate = 5e-5

# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=limited_dataset,
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)
trainer.train()
Inference
In [ ]:
from transformers import pipeline

# Load a fine-tuned model from huggingface.co/models
# (swap in your own repository id if you pushed a model above)
classifier = pipeline(
    task="text-classification",
    model="argilla/ModernBERT-domain-classifier",
    device=0,
)

sample = """def add_numbers(a, b):
    return a + b"""

classifier(sample)
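The pipeline also accepts a list of inputs, which is handy for spot-checking several snippets at once; the examples below are made up, and the returned labels depend on the model's config.
In [ ]:
# Classify several made-up snippets in one call
samples = [
    "import numpy as np\narr = np.zeros(10)",
    "SELECT * FROM users WHERE id = 1;",
]
classifier(samples)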