![structure](./docs/source/_static/structure.png)

**Open-retrievals** unifies text embedding, retrieval, reranking and RAG. It's easy, flexible and scalable.

- - Embedding fine-tuned through point-wise, pairwise, listwise, contrastive learning, and LLM.
- - Reranking fine-tuned with Cross Encoder, ColBERT, and LLM.
- - Easily build enhanced modular RAG, integrated with Transformers, Langchain, and LlamaIndex.
+ - Embedding fine-tuned through point-wise, pairwise, listwise, contrastive learning and LLM.
+ - Reranking fine-tuned with Cross-Encoder, ColBERT and LLM.
+ - Easily build enhanced modular RAG, integrated with Transformers, Langchain and LlamaIndex.
| Experiment            | Model                 | Original | Finetuned | Demo |
|------------------------|-----------------------|----------|-----------|------|
| **rerank** ColBERT     | bge-m3                | 0.657    | **0.695** | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1QVtqhQ080ZMltXoJyODMmvEQYI6oo5kO?usp=sharing) |
| **rerank** LLM (LoRA)  | bge-reranker-v2-gemma | 0.637    | **0.706** | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1fzq1iV7-f8hNKFnjMmpVhVxadqPb9IXk?usp=sharing) |

- * The metrics is MAP in 10% eval [t2-reranking data](https://huggingface.co/datasets/C-MTEB/T2Reranking).
+ * The eval metric is MAP on a 10% split of the [t2-reranking data](https://huggingface.co/datasets/C-MTEB/T2Reranking) (see the sketch below).
* Read [more examples](./examples)
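For readers unfamiliar with the metric, the sketch below shows one common way MAP can be computed over a reranking eval split. It is an illustrative outline only (the function names and toy relevance lists are made up for this sketch), not the evaluation script behind the table above.

```python
from typing import List

def average_precision(ranked_relevance: List[int]) -> float:
    """AP for one query; ranked_relevance[i] is 1 if the i-th ranked passage is relevant."""
    hits, precisions = 0, []
    for rank, rel in enumerate(ranked_relevance, start=1):
        if rel:
            hits += 1
            precisions.append(hits / rank)
    return sum(precisions) / max(hits, 1)

def mean_average_precision(per_query_relevance: List[List[int]]) -> float:
    """MAP: mean of the per-query average precision over the eval split."""
    return sum(average_precision(r) for r in per_query_relevance) / len(per_query_relevance)

# Toy example: two queries whose passages are already sorted by the reranker score.
print(mean_average_precision([[1, 0, 1, 0], [0, 1, 1]]))
```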
@@ -76,7 +76,7 @@ python -m pip install -U git+https://github.com/LongxingTan/open-retrievals.git

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1-WBMisdWLeHUKlzJ2DrREXY_kSV8vjP3?usp=sharing)

- <details><summary> Embeddings from pretrained weights </summary>
+ <details><summary> Embedding from pretrained weights </summary>

```python
from retrievals import AutoModelForEmbedding
@@ -89,7 +89,7 @@ sentences = [
]
model_name_or_path = 'intfloat/e5-base-v2'
model = AutoModelForEmbedding.from_pretrained(model_name_or_path, pooling_method="mean")
- embeddings = model.encode(sentences, normalize_embeddings=True, convert_to_tensor=True)
+ embeddings = model.encode(sentences, normalize_embeddings=True)
scores = (embeddings[:2] @ embeddings[2:].T) * 100
print(scores.tolist())
```
@@ -103,7 +103,7 @@ from retrievals import AutoModelForEmbedding, AutoModelForRetrieval
sentences = ['A dog is chasing car.', 'A man is playing a guitar.']
model_name_or_path = "sentence-transformers/all-MiniLM-L6-v2"
index_path = './database/faiss/faiss.index'
- model = AutoModelForEmbedding.from_pretrained(model_name_or_path)
+ model = AutoModelForEmbedding.from_pretrained(model_name_or_path, pooling_method='mean')
model.build_index(sentences, index_path=index_path)

query_embed = model.encode("He plays guitar.")
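This hunk stops before the search step; `AutoModelForRetrieval` is imported above, but its call sits outside the shown context. As a rough stand-in, the index written by `build_index` can be queried with FAISS directly. The snippet below is a minimal sketch under the assumption that the file is a standard FAISS index, not the library's own retrieval API.

```python
# Assumption: faiss.index written by build_index is a plain FAISS index over the
# sentence embeddings, so it can be read back and searched directly.
import faiss
import numpy as np

index = faiss.read_index(index_path)
query = np.asarray(query_embed, dtype="float32").reshape(1, -1)
dists, ids = index.search(query, 1)          # top-1 nearest sentence
print(sentences[ids[0][0]], dists[0][0])
```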
@@ -216,7 +216,7 @@ epochs: int = 3
train_dataset = load_dataset('shibing624/nli_zh', 'STS-B')['train']
train_dataset = train_dataset.rename_columns({'sentence1': 'query', 'sentence2': 'positive'})
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False)
- model = AutoModelForEmbedding.from_pretrained(model_name_or_path, pooling_method="cls")
+ model = AutoModelForEmbedding.from_pretrained(model_name_or_path, pooling_method="mean")
model = model.set_train_type('pairwise')

optimizer = AdamW(model.parameters(), lr=5e-5)
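For context on the 'pairwise' training mode used above: a contrastive objective of the InfoNCE family treats each query's paired positive as the correct target among the in-batch candidates. The sketch below illustrates that idea; it is not the `retrievals.losses.InfoNCE` implementation, and the function name is made up for this sketch.

```python
import torch
import torch.nn.functional as F

def in_batch_info_nce(query_embed: torch.Tensor, positive_embed: torch.Tensor, temperature: float = 0.05):
    # Similarity of every query to every positive in the batch; the diagonal holds the true pairs.
    logits = query_embed @ positive_embed.T / temperature
    labels = torch.arange(logits.size(0), device=logits.device)
    return F.cross_entropy(logits, labels)
```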
@@ -252,14 +252,22 @@ import torch.nn as nn
from datasets import load_dataset
from transformers import AutoTokenizer, AdamW, get_linear_schedule_with_warmup, TrainingArguments
from retrievals import AutoModelForEmbedding, RetrievalTrainer, PairCollator, TripletCollator
- from retrievals.losses import ArcFaceAdaptiveMarginLoss, InfoNCE, SimCSE, TripletLoss
+ from retrievals.losses import InfoNCE, SimCSE, TripletLoss
+
+ def add_instructions(example):
+     example['query'] = query_instruction + example['query']
+     example['positive'] = document_instruction + example['positive']
+     return example

model_name_or_path: str = "Qwen/Qwen2-1.5B-Instruct"
batch_size: int = 8
epochs: int = 3
+ query_instruction = "Retrieve relevant passages that answer the query\nQuery: "
+ document_instruction = "Document: "

train_dataset = load_dataset('shibing624/nli_zh', 'STS-B')['train']
train_dataset = train_dataset.rename_columns({'sentence1': 'query', 'sentence2': 'positive'})
+ train_dataset = train_dataset.map(add_instructions)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False)
model = AutoModelForEmbedding.from_pretrained(model_name_or_path, pooling_method="last", use_lora=True)
model = model.set_train_type('pairwise', loss_fn=InfoNCE(nn.CrossEntropyLoss(label_smoothing=0.05)))
@@ -272,6 +280,7 @@ training_arguments = TrainingArguments(
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    remove_unused_columns=False,
+     logging_steps=100,
)
trainer = RetrievalTrainer(
    model=model,
@@ -291,25 +300,32 @@ trainer.train()
from transformers import AutoTokenizer, TrainingArguments, get_cosine_schedule_with_warmup, AdamW
from retrievals import RerankCollator, AutoModelForRanking, RerankTrainer, RerankTrainDataset

- model_name_or_path: str = "microsoft/deberta-v3-base"
+ model_name_or_path: str = "BAAI/bge-reranker-base"
max_length: int = 128
learning_rate: float = 3e-5
batch_size: int = 4
epochs: int = 3
+ output_dir: str = "./checkpoints"

- train_dataset = RerankTrainDataset('./t2rank.json', positive_key='pos', negative_key='neg')
+ train_dataset = RerankTrainDataset("C-MTEB/T2Reranking", positive_key="positive", negative_key="negative", dataset_split='dev')
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False)
model = AutoModelForRanking.from_pretrained(model_name_or_path)
optimizer = AdamW(model.parameters(), lr=learning_rate)
num_train_steps = int(len(train_dataset) / batch_size * epochs)
- scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=0.05 * num_train_steps, num_training_steps=num_train_steps)
+ scheduler = get_cosine_schedule_with_warmup(
+     optimizer,
+     num_warmup_steps=0.05 * num_train_steps,
+     num_training_steps=num_train_steps,
+ )

training_args = TrainingArguments(
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    num_train_epochs=epochs,
-     output_dir='./checkpoints',
+     output_dir=output_dir,
    remove_unused_columns=False,
+     logging_steps=100,
+     report_to="none",
)
trainer = RerankTrainer(
    model=model,
@@ -348,9 +364,7 @@ epochs: int = 3
colbert_dim: int = 1024
output_dir: str = './checkpoints'

- train_dataset = RetrievalTrainDataset(
-     'C-MTEB/T2Reranking', positive_key='positive', negative_key='negative', dataset_split='dev'
- )
+ train_dataset = RetrievalTrainDataset('C-MTEB/T2Reranking', positive_key='positive', negative_key='negative', dataset_split='dev')
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False)
data_collator = ColBertCollator(
    tokenizer,
@@ -367,9 +381,7 @@ model = ColBERT.from_pretrained(

optimizer = AdamW(model.parameters(), lr=learning_rate)
num_train_steps = int(len(train_dataset) / batch_size * epochs)
- scheduler = get_cosine_schedule_with_warmup(
-     optimizer, num_warmup_steps=0.05 * num_train_steps, num_training_steps=num_train_steps
- )
+ scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=0.05 * num_train_steps, num_training_steps=num_train_steps)

training_args = TrainingArguments(
    learning_rate=learning_rate,
@@ -394,7 +406,74 @@ trainer.train()
<details><summary> Fine-tune LLM reranking </summary>

```python
+ from transformers import (
+     AdamW,
+     AutoTokenizer,
+     TrainingArguments,
+     get_cosine_schedule_with_warmup,
+ )

+ from retrievals import (
+     LLMRanker,
+     LLMRerankCollator,
+     RerankTrainer,
+     RetrievalTrainDataset,
+ )
+ from retrievals.losses import TokenLoss
+
+ model_name_or_path: str = "Qwen/Qwen2-1.5B-Instruct"
+ max_length: int = 512
+ learning_rate: float = 3e-5
+ batch_size: int = 8
+ epochs: int = 3
+ task_prompt: str = (
+     """Given a query A and a passage B, determine whether the passage contains an answer to the query"""
+     """ by providing a prediction of either 'Yes' or 'No'."""
+ )
+
+ tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False)
+ train_dataset = RetrievalTrainDataset(
+     data_name_or_path='C-MTEB/T2Reranking',
+     positive_key='positive',
+     negative_key='negative',
+     query_instruction='A: ',
+     document_instruction='B: ',
+     dataset_split='dev',
+ )
+ data_collator = LLMRerankCollator(tokenizer=tokenizer, max_length=max_length, prompt=task_prompt, add_target_token='Yes')
+ # id of the 'Yes' token, used by TokenLoss as the relevance target
+ token_index = tokenizer('Yes', add_special_tokens=False)['input_ids'][-1]
+ model = LLMRanker.from_pretrained(
+     model_name_or_path,
+     causal_lm=True,
+     use_fp16=True,
+     loss_fn=TokenLoss(token_index=token_index),
+     use_lora=True,
+ )
+
+ optimizer = AdamW(model.parameters(), lr=learning_rate)
+ num_train_steps = int(len(train_dataset) / batch_size * epochs)
+ scheduler = get_cosine_schedule_with_warmup(
+     optimizer,
+     num_warmup_steps=0.05 * num_train_steps,
+     num_training_steps=num_train_steps,
+ )
+
+ training_args = TrainingArguments(
+     learning_rate=learning_rate,
+     per_device_train_batch_size=batch_size,
+     num_train_epochs=epochs,
+     output_dir="./checkpoints",
+     remove_unused_columns=False,
+ )
+ trainer = RerankTrainer(
+     model=model,
+     args=training_args,
+     train_dataset=train_dataset,
+     data_collator=data_collator,
+ )
+ trainer.optimizer = optimizer
+ trainer.scheduler = scheduler
+ trainer.train()
```
</details>
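As a follow-up on the design in the hunk above: a ranker trained with a 'Yes'-token objective scores a (query, passage) pair at inference by the model's logit for the 'Yes' token at the final position. The sketch below illustrates that with plain `transformers`; the exact prompt assembly and the reuse of `Qwen/Qwen2-1.5B-Instruct` are assumptions for illustration, not the `LLMRanker` inference API.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

name = "Qwen/Qwen2-1.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForCausalLM.from_pretrained(name)
yes_id = tokenizer("Yes", add_special_tokens=False)["input_ids"][-1]

query, passage = "what is ColBERT", "ColBERT is a late-interaction retrieval model."
text = (
    "Given a query A and a passage B, determine whether the passage contains an answer to the query"
    " by providing a prediction of either 'Yes' or 'No'.\n"
    f"A: {query}\nB: {passage}\n"
)
inputs = tokenizer(text, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits      # [1, seq_len, vocab_size]
score = logits[0, -1, yes_id].item()     # higher logit for 'Yes' -> more relevant
print(score)
```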