Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.
Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.
Path: blob/main/course/fr/chapter5/section3.ipynb
Views: 2555
Kernel: Python 3
Il est temps de trancher et de découper
Installez les bibliothèques 🤗 Transformers et 🤗 Datasets pour exécuter ce notebook.
In [ ]:
!pip install datasets evaluate transformers[sentencepiece]
In [ ]:
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip" !unzip drugsCom_raw.zip
In [ ]:
from datasets import load_dataset

# Map each split name to the TSV file extracted from the archive.
data_files = {"train": "drugsComTrain_raw.tsv", "test": "drugsComTest_raw.tsv"}
# "\t" is the tab character in Python
drug_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")
In [ ]:
# Shuffle the training split with a fixed seed and keep the first 1000 rows.
drug_sample = drug_dataset["train"].shuffle(seed=42).select(range(1000))
# Peek at the first few examples
drug_sample[:3]
In [ ]:
# Sanity check: in every split, the "Unnamed: 0" column must contain as many
# distinct values as the split has rows.
for split in drug_dataset:
    assert len(drug_dataset[split].unique("Unnamed: 0")) == len(drug_dataset[split])
In [ ]:
# Rename the "Unnamed: 0" column to "patient_id", then display the result.
drug_dataset = drug_dataset.rename_column(
    original_column_name="Unnamed: 0",
    new_column_name="patient_id",
)
drug_dataset
In [ ]:
def lowercase_condition(example):
    """Return an update dict with the example's "condition" lowercased.

    Raises AttributeError when the condition is None, since None has no
    ``lower`` method.
    """
    lowered = example["condition"].lower()
    return {"condition": lowered}


# NOTE(review): the result is not assigned here. A later cell filters out
# rows whose condition is None before mapping again, so this call presumably
# fails on the unfiltered data.
drug_dataset.map(lowercase_condition)
In [ ]:
def filter_nones(x):
    """Return True when the example's "condition" field is not None."""
    condition = x["condition"]
    return condition is not None
In [ ]:
# Define an anonymous squaring function and call it immediately with 3 (evaluates to 9).
(lambda x: x * x)(3)
In [ ]:
# A lambda may take several arguments: this one computes 0.5 * base * height
# and is applied to (4, 8), which evaluates to 16.0.
(lambda base, height: 0.5 * base * height)(4, 8)
In [ ]:
# Keep only the rows whose "condition" field is not None.
drug_dataset = drug_dataset.filter(lambda x: x["condition"] is not None)
In [ ]:
# Apply the lowercasing to every split and keep the result this time.
drug_dataset = drug_dataset.map(lowercase_condition)
# Check that lowercasing worked
drug_dataset["train"]["condition"][:3]
In [ ]:
def compute_review_length(example):
    """Return an update dict holding the number of whitespace-separated words in the review."""
    words = example["review"].split()
    return {"review_length": len(words)}
In [ ]:
# Add the "review_length" field produced by compute_review_length to every split.
drug_dataset = drug_dataset.map(compute_review_length)
# Inspect the first training example
drug_dataset["train"][0]
In [ ]:
# Sort the training split by "review_length" and show the first three rows of the result.
drug_dataset["train"].sort("review_length")[:3]
In [ ]:
# Keep only rows whose "review_length" is strictly greater than 30, then
# print the number of rows left in each split.
drug_dataset = drug_dataset.filter(lambda x: x["review_length"] > 30)
print(drug_dataset.num_rows)
In [ ]:
import html

# The apostrophe is written as the HTML character reference "&#039;" so that
# html.unescape has something to decode; with a plain apostrophe the call
# would return its input unchanged and demonstrate nothing.
text = "I&#039;m a transformer called BERT"
html.unescape(text)
In [ ]:
# Replace every review by its HTML-unescaped text; without batched=True the
# lambda is given a single example, so x["review"] is passed to html.unescape directly.
drug_dataset = drug_dataset.map(lambda x: {"review": html.unescape(x["review"])})
In [ ]:
# Same unescaping with batched=True: x["review"] is iterated here, one
# html.unescape call per item. The result goes into a separate variable.
new_drug_dataset = drug_dataset.map( lambda x: {"review": [html.unescape(o) for o in x["review"]]}, batched=True )
In [ ]:
from transformers import AutoTokenizer

# Tokenizer shared by the tokenization helpers defined in this notebook.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    """Tokenize the "review" field of ``examples`` with truncation enabled."""
    reviews = examples["review"]
    return tokenizer(reviews, truncation=True)
In [ ]:
# %time is an IPython magic that reports how long the statement takes to run;
# the statement itself tokenizes every split with batched=True.
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched=True)
In [ ]:
# Same checkpoint, loaded with use_fast=False.
slow_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)


def slow_tokenize_function(examples):
    """Tokenize the "review" field of ``examples`` with ``slow_tokenizer``, truncation enabled."""
    reviews = examples["review"]
    return slow_tokenizer(reviews, truncation=True)


# num_proc=8 asks map to spread the work over 8 processes.
tokenized_dataset = drug_dataset.map(
    slow_tokenize_function,
    batched=True,
    num_proc=8,
)
In [ ]:
def tokenize_and_split(examples):
    """Tokenize the reviews with max_length=128, also returning the overflowing tokens.

    Returns whatever the module-level ``tokenizer`` produces for these options.
    """
    tokenizer_options = {
        "truncation": True,
        "max_length": 128,
        "return_overflowing_tokens": True,
    }
    return tokenizer(examples["review"], **tokenizer_options)
In [ ]:
# Run the splitter on the first training example and list the length of each
# "input_ids" sequence it produced.
result = tokenize_and_split(drug_dataset["train"][0])
list(map(len, result["input_ids"]))
In [ ]:
# NOTE(review): tokenize_and_split asks for overflowing tokens, so it may
# return more rows than it received while the original columns are kept;
# this call presumably fails with a column-length mismatch. Confirm — the
# following cell passes remove_columns to avoid that.
tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)
In [ ]:
# Same mapping, but every original column of the training split is removed
# from the output, so only the tokenizer's columns remain.
tokenized_dataset = drug_dataset.map( tokenize_and_split, batched=True, remove_columns=drug_dataset["train"].column_names )
In [ ]:
# Compare the row counts of the tokenized training split and the original one.
len(tokenized_dataset["train"]), len(drug_dataset["train"])
In [ ]:
def tokenize_and_split(examples):
    """Tokenize the reviews and align the original columns with the result.

    The tokenizer is asked for overflowing tokens with max_length=128. Its
    "overflow_to_sample_mapping" entry is removed from the result and used to
    repeat each original column value once per entry in that mapping, so the
    original columns get the same length as the mapping.
    """
    result = tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )
    # Extract the mapping between the new indices and the old ones
    sample_map = result.pop("overflow_to_sample_mapping")
    for key, values in examples.items():
        result[key] = [values[i] for i in sample_map]
    return result
In [ ]:
# Map the current tokenize_and_split over every split, then display the result.
tokenized_dataset = drug_dataset.map(
    tokenize_and_split,
    batched=True,
)
tokenized_dataset
In [ ]:
# Switch the dataset's output format to "pandas" (modifies drug_dataset in place; nothing is reassigned).
drug_dataset.set_format("pandas")
In [ ]:
# Show the first three training rows in the currently configured output format.
drug_dataset["train"][:3]
In [ ]:
# Slice the whole training split into a single object; the next cell uses it
# as a pandas DataFrame.
train_df = drug_dataset["train"][:]
In [ ]:
# Count how often each condition occurs and expose the result as a two-column
# frame: "condition" and "frequency".
# The index and the count column are named explicitly because the default
# names produced by value_counts() differ between pandas versions, so renaming
# by the old default names ("index"/"condition") is not reliable.
frequencies = (
    train_df["condition"]
    .value_counts()
    .rename_axis("condition")
    .reset_index(name="frequency")
)
frequencies.head()
In [ ]:
from datasets import Dataset

# Build a Dataset from the frequencies DataFrame and display it.
freq_dataset = Dataset.from_pandas(frequencies)
freq_dataset
In [ ]:
# Undo the earlier set_format("pandas") call so the dataset uses its default output format again.
drug_dataset.reset_format()
In [ ]:
# Split the training data with train_size=0.8 and a fixed seed.
drug_dataset_clean = drug_dataset["train"].train_test_split(train_size=0.8, seed=42)
# Rename the default "test" split to "validation"
drug_dataset_clean["validation"] = drug_dataset_clean.pop("test")
# Add the "test" set to our `DatasetDict`
drug_dataset_clean["test"] = drug_dataset["test"]
drug_dataset_clean
In [ ]:
# Save the cleaned dataset under the local path "drug-reviews".
drug_dataset_clean.save_to_disk("drug-reviews")
In [ ]:
from datasets import load_from_disk

# Load back what was saved under "drug-reviews" and display it.
drug_dataset_reloaded = load_from_disk("drug-reviews")
drug_dataset_reloaded
In [ ]:
# Export each split to its own file, named after the split.
for split, dataset in drug_dataset_clean.items():
    dataset.to_json(f"drug-reviews-{split}.jsonl")
In [ ]:
# Print the first line of the exported training file.
!head -n 1 drug-reviews-train.jsonl
In [ ]:
# Map each split name to the JSON Lines file written for it, then load all
# three with the "json" loader.
data_files = {
    "train": "drug-reviews-train.jsonl",
    "validation": "drug-reviews-validation.jsonl",
    "test": "drug-reviews-test.jsonl",
}
drug_dataset_reloaded = load_dataset("json", data_files=data_files)