GitHub Repository: huggingface/notebooks
Path: blob/main/sagemaker/32_train_deploy_embedding_models/scripts/run_mnr.py
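
"""Train and evaluate an embedding model on Amazon SageMaker.

Fine-tunes a Sentence Transformers model with MatryoshkaLoss wrapped around
MultipleNegativesRankingLoss, and evaluates retrieval quality at several
embedding dimensions with one InformationRetrievalEvaluator per dimension.
"""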
from dataclasses import dataclass, field
import os

from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerModelCardData,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)
from sentence_transformers.evaluation import (
    InformationRetrievalEvaluator,
    SequentialEvaluator,
)
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.util import cos_sim
from transformers import set_seed, HfArgumentParser
from datasets import load_dataset, concatenate_datasets

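# SageMaker passes the estimator's `hyperparameters` to this entry point as
# command-line flags; HfArgumentParser turns the dataclass fields below into
# the matching CLI arguments.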
@dataclass
class ScriptArguments:
    train_dataset_path: str = field(
        default="/opt/ml/input/data/train/",
        metadata={"help": "Path to the training dataset, e.g. /opt/ml/input/data/train/"},
    )
    test_dataset_path: str = field(
        default="/opt/ml/input/data/test/",
        metadata={"help": "Path to the test dataset, e.g. /opt/ml/input/data/test/"},
    )
    model_id: str = field(
        default=None, metadata={"help": "Model ID to use for embedding training"}
    )
    num_train_epochs: int = field(
        default=1, metadata={"help": "Number of training epochs"}
    )
    per_device_train_batch_size: int = field(
        default=32, metadata={"help": "Training batch size"}
    )
    per_device_eval_batch_size: int = field(
        default=16, metadata={"help": "Evaluation batch size"}
    )
    gradient_accumulation_steps: int = field(
        default=16, metadata={"help": "Gradient accumulation steps"}
    )
    learning_rate: float = field(
        default=2e-5, metadata={"help": "Learning rate for the optimizer"}
    )


def create_evaluator(
    train_dataset, test_dataset, matryoshka_dimensions=[768, 512, 256, 128, 64]
):
    corpus_dataset = concatenate_datasets([train_dataset, test_dataset])

    # Convert the datasets to dictionaries
    corpus = dict(
        zip(corpus_dataset["id"], corpus_dataset["positive"])
    )  # Our corpus (cid => document)
    queries = dict(
        zip(test_dataset["id"], test_dataset["anchor"])
    )  # Our queries (qid => question)

    # Map each query to its single relevant document; anchor and positive
    # share the same id, so the query id is also the relevant corpus id
    relevant_docs = {}  # Query ID to relevant documents (qid => [relevant_cids])
    for q_id in queries:
        relevant_docs[q_id] = [q_id]

    matryoshka_evaluators = []
    # Create one retrieval evaluator per Matryoshka dimension
    for dim in matryoshka_dimensions:
        ir_evaluator = InformationRetrievalEvaluator(
            queries=queries,
            corpus=corpus,
            relevant_docs=relevant_docs,
            name=f"dim_{dim}",
            truncate_dim=dim,  # truncate the embeddings to this dimension
            score_functions={"cosine": cos_sim},
        )
        matryoshka_evaluators.append(ir_evaluator)

    # Chain the evaluators so every dimension is scored in one evaluation pass
    return SequentialEvaluator(matryoshka_evaluators)
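
# The train/test "dataset.json" files are expected to provide "id", "anchor"
# (query), and "positive" (document) columns; see the column accesses above
# and the select_columns call below.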


def training_function(script_args):
    ################
    # Dataset
    ################

    train_dataset = load_dataset(
        "json",
        data_files=os.path.join(script_args.train_dataset_path, "dataset.json"),
        split="train",
    )
    test_dataset = load_dataset(
        "json",
        data_files=os.path.join(script_args.test_dataset_path, "dataset.json"),
        split="train",
    )

    ###################
    # Model & Evaluator
    ###################

    matryoshka_dimensions = [768, 512, 256, 128, 64]  # Important: large to small

    model = SentenceTransformer(
        script_args.model_id,
        device="cuda",
        model_kwargs={"attn_implementation": "sdpa"},  # needs Ampere GPU or newer
        model_card_data=SentenceTransformerModelCardData(
            language="en",
            license="apache-2.0",
            model_name="BGE base Financial Matryoshka",
        ),
    )
    evaluator = create_evaluator(
        train_dataset, test_dataset, matryoshka_dimensions=matryoshka_dimensions
    )

    ###################
    # Loss Function
    ###################

    # Wrap MultipleNegativesRankingLoss in MatryoshkaLoss so the loss is
    # computed at every Matryoshka dimension
    inner_train_loss = MultipleNegativesRankingLoss(model)
    train_loss = MatryoshkaLoss(
        model, inner_train_loss, matryoshka_dims=matryoshka_dimensions
    )

    ################
    # Training
    ################
    training_args = SentenceTransformerTrainingArguments(
        output_dir="/opt/ml/model",  # output directory for SageMaker to upload to S3
        num_train_epochs=script_args.num_train_epochs,  # number of epochs
        per_device_train_batch_size=script_args.per_device_train_batch_size,  # training batch size
        per_device_eval_batch_size=script_args.per_device_eval_batch_size,  # evaluation batch size
        gradient_accumulation_steps=script_args.gradient_accumulation_steps,  # gradient accumulation steps
        warmup_ratio=0.1,  # warmup ratio
        learning_rate=script_args.learning_rate,  # learning rate
        lr_scheduler_type="cosine",  # use cosine learning rate schedule
        optim="adamw_torch_fused",  # use fused adamw optimizer
        tf32=True,  # use tf32 precision
        bf16=True,  # use bf16 precision
        batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
        eval_strategy="epoch",  # evaluate after each epoch
        save_strategy="epoch",  # save after each epoch
        logging_steps=10,  # log every 10 steps
        save_total_limit=3,  # keep only the last 3 checkpoints
        load_best_model_at_end=True,  # load the best model when training ends
        metric_for_best_model="eval_dim_128_cosine_ndcg@10",  # optimize for the best ndcg@10 score at dimension 128
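        # The metric name combines the evaluator name above ("dim_128"), the
        # "cosine" score function, and ndcg@10, plus the Trainer's "eval_" prefix.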
    )

    trainer = SentenceTransformerTrainer(
        model=model,  # the SentenceTransformer loaded above, e.g. bge-base-en-v1.5
        args=training_args,  # training arguments
        train_dataset=train_dataset.select_columns(
            ["positive", "anchor"]
        ),  # training dataset
        loss=train_loss,
        evaluator=evaluator,
    )

    ##########################
    # Train model
    ##########################
    # start training; checkpoints are written to the output directory and the
    # best one is reloaded at the end
    trainer.train()

    # save the best model
    trainer.save_model()


if __name__ == "__main__":
    parser = HfArgumentParser(ScriptArguments)
    script_args = parser.parse_args_into_dataclasses()[0]

    # set seed for reproducibility
    set_seed(42)

    # launch training
    training_function(script_args)
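
# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the script): this entry point is
# meant to be launched with the SageMaker HuggingFace estimator, whose
# "train"/"test" channels are mounted at /opt/ml/input/data/{train,test}.
# The role, framework versions, and S3 URIs below are placeholders.
#
#   from sagemaker.huggingface import HuggingFace
#
#   estimator = HuggingFace(
#       entry_point="run_mnr.py",
#       source_dir="scripts",
#       instance_type="ml.g5.2xlarge",  # Ampere GPU, needed for bf16/tf32
#       instance_count=1,
#       role=role,  # an IAM role with SageMaker permissions
#       transformers_version="4.36",
#       pytorch_version="2.1",
#       py_version="py310",
#       hyperparameters={"model_id": "BAAI/bge-base-en-v1.5"},
#   )
#   estimator.fit({"train": train_s3_uri, "test": test_s3_uri})
# ---------------------------------------------------------------------------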