CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutSign UpSign In
huggingface

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.

GitHub Repository: huggingface/notebooks
Path: blob/main/course/fr/chapter7/section6_tf.ipynb
Views: 2555
Kernel: Python 3

Entraîner un modèle de langage causal de zéro (TensorFlow)

Ici nous entraînons un modèle à générer du code Python. Le Python utilisant des fonctions basées sur des mots anglais, nous gardons un gpt-2 anglais dans l'optique d'obtenir de meilleures performances que ce que l'on pourrait s'attendre en utilisant un gpt-2 en français.

Installez les bibliothèques 🤗 Datasets et 🤗 Transformers pour exécuter ce notebook.

!pip install datasets transformers[sentencepiece] !apt install git-lfs

Vous aurez besoin de configurer git, adaptez votre email et votre nom dans la cellule suivante.

!git config --global user.email "[email protected]" !git config --global user.name "Your Name"

Vous devrez également être connecté au Hub d'Hugging Face. Exécutez ce qui suit et entrez vos informations d'identification.

from huggingface_hub import notebook_login notebook_login()
def any_keyword_in_string(string, keywords): for keyword in keywords: if keyword in string: return True return False
filters = ["pandas", "sklearn", "matplotlib", "seaborn"] example_1 = "import numpy as np" example_2 = "import pandas as pd" print( any_keyword_in_string(example_1, filters), any_keyword_in_string(example_2, filters) )
from collections import defaultdict from tqdm import tqdm from datasets import Dataset def filter_streaming_dataset(dataset, filters): filtered_dict = defaultdict(list) total = 0 for sample in tqdm(iter(dataset)): total += 1 if any_keyword_in_string(sample["content"], filters): for k, v in sample.items(): filtered_dict[k].append(v) print(f"{len(filtered_dict['content'])/total:.2%} of data after filtering.") return Dataset.from_dict(filtered_dict)
# Cette cellule prendra beaucoup de temps à s'exécuter, donc vous devriez la sauter et aller à la suivante ! from datasets import load_dataset split = "train" # "valid" filters = ["pandas", "sklearn", "matplotlib", "seaborn"] data = load_dataset(f"transformersbook/codeparrot-{split}", split=split, streaming=True) filtered_data = filter_streaming_dataset(data, filters)
from datasets import load_dataset, DatasetDict ds_train = load_dataset("huggingface-course/codeparrot-ds-train", split="train") ds_valid = load_dataset("huggingface-course/codeparrot-ds-valid", split="validation") raw_datasets = DatasetDict( { "train": ds_train, # .shuffle().select(range(50000)), "valid": ds_valid, # .shuffle().select(range(500)) } ) raw_datasets
for key in raw_datasets["train"][0]: print(f"{key.upper()}: {raw_datasets['train'][0][key][:200]}")
from transformers import AutoTokenizer context_length = 128 tokenizer = AutoTokenizer.from_pretrained("huggingface-course/code-search-net-tokenizer") outputs = tokenizer( raw_datasets["train"][:2]["content"], truncation=True, max_length=context_length, return_overflowing_tokens=True, return_length=True, ) print(f"Input IDs length: {len(outputs['input_ids'])}") print(f"Input chunk lengths: {(outputs['length'])}") print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")
def tokenize(element): outputs = tokenizer( element["content"], truncation=True, max_length=context_length, return_overflowing_tokens=True, return_length=True, ) input_batch = [] for length, input_ids in zip(outputs["length"], outputs["input_ids"]): if length == context_length: input_batch.append(input_ids) return {"input_ids": input_batch} tokenized_datasets = raw_datasets.map( tokenize, batched=True, remove_columns=raw_datasets["train"].column_names ) tokenized_datasets
from transformers import AutoTokenizer, TFGPT2LMHeadModel, AutoConfig config = AutoConfig.from_pretrained( "gpt2", vocab_size=len(tokenizer), n_ctx=context_length, bos_token_id=tokenizer.bos_token_id, eos_token_id=tokenizer.eos_token_id, )
model = TFGPT2LMHeadModel(config) model(model.dummy_inputs) # Construit le modèle model.summary()
from transformers import DataCollatorForLanguageModeling tokenizer.pad_token = tokenizer.eos_token data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="tf")
out = data_collator([tokenized_datasets["train"][i] for i in range(5)]) for key in out: print(f"{key} shape: {out[key].shape}")
tf_train_dataset = model.prepare_tf_dataset( train_dataset, collate_fn=data_collator, shuffle=True, batch_size=16, ) tf_eval_dataset = model.prepare_tf_dataset( tokenized_dataset["valid"], collate_fn=data_collator, shuffle=False, batch_size=32, )
from huggingface_hub import notebook_login notebook_login()
from transformers import create_optimizer import tensorflow as tf num_train_steps = len(tf_train_dataset) optimizer, schedule = create_optimizer( init_lr=5e-5, num_warmup_steps=1_000, num_train_steps=num_train_steps, weight_decay_rate=0.01, ) model.compile(optimizer=optimizer) # Entraîner en mixed-precision float16 tf.keras.mixed_precision.set_global_policy("mixed_float16")
from transformers.keras_callbacks import PushToHubCallback callback = PushToHubCallback(output_dir="codeparrot-ds", tokenizer=tokenizer) model.fit(tf_train_dataset, validation_data=tf_eval_dataset, callbacks=[callback])
from transformers import pipeline course_model = TFGPT2LMHeadModel.from_pretrained("huggingface-course/codeparrot-ds") course_tokenizer = AutoTokenizer.from_pretrained("huggingface-course/codeparrot-ds") pipe = pipeline( "text-generation", model=course_model, tokenizer=course_tokenizer, device=0 )
txt = """\ # create some data x = np.random.randn(100) y = np.random.randn(100) # create scatter plot with x, y """ print(pipe(txt, num_return_sequences=1)[0]["generated_text"])
txt = """\ # create some data x = np.random.randn(100) y = np.random.randn(100) # create dataframe from x and y """ print(pipe(txt, num_return_sequences=1)[0]["generated_text"])
txt = """\ # dataframe with profession, income and name df = pd.DataFrame({'profession': x, 'income':y, 'name': z}) # calculate the mean income per profession """ print(pipe(txt, num_return_sequences=1)[0]["generated_text"])
txt = """ # import random forest regressor from scikit-learn from sklearn.ensemble import RandomForestRegressor # fit random forest model with 300 estimators on X, y: """ print(pipe(txt, num_return_sequences=1)[0]["generated_text"])