Introduction
We introduce ElbEmbedding, ...
For more technical details, refer to our paper: ...
Model Details
- Base Decoder-only LLM: ...
- Pooling Type: Last EOS Token (see the sketch below)
- Maximum context length: 512
- Embedding Dimension: 4096
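These settings mean that inputs longer than 512 tokens are truncated, and each text is mapped to a single 4096-dimensional vector taken from the hidden state at its final EOS position. As a minimal sketch of that pooling contract with random tensors (the shapes match the specs above, but `last_hidden_state` here is illustrative, not real model output):

```python
import torch

# Illustrative tensors only; the runnable pipeline is shown under "How to use?".
batch, seq_len, dim = 2, 512, 4096
last_hidden_state = torch.randn(batch, seq_len, dim)
attention_mask = torch.ones(batch, seq_len, dtype=torch.long)
attention_mask[1, 300:] = 0  # second sequence ends at position 299 (its EOS)

# Last-EOS-token pooling: take the hidden state at each sequence's last
# non-padding position, then L2-normalize to get the final embedding.
lengths = attention_mask.sum(dim=1) - 1
embeddings = last_hidden_state[torch.arange(batch), lengths]
embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
print(embeddings.shape)  # torch.Size([2, 4096])
```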
How to use?
```python
from typing import List

import torch
from transformers import AutoModel, AutoTokenizer


def get_detailed_instruct(queries: List[str]) -> List[str]:
    # Queries (but not documents) must carry the instruction prefix the model was trained with.
    return [f"Instruct: Retrieve semantically similar text.\nQuery: {query}" for query in queries]


def tokenize(sentences: List[str], tokenizer: AutoTokenizer):
    # Append EOS: the embedding is pooled from the last EOS position.
    texts = [x + tokenizer.eos_token for x in sentences]
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512).to("cuda")
    # Make sure the final position is EOS even when the text was truncated.
    inputs.input_ids[:, -1] = tokenizer.eos_token_id
    inputs.pop("token_type_ids", None)
    return inputs


def pool(last_hidden_state: torch.Tensor, attention_mask: torch.Tensor, do_normalize: bool = True) -> torch.Tensor:
    # With left padding, every sequence ends at the last position.
    left_padding = attention_mask[:, -1].sum() == attention_mask.shape[0]
    if left_padding:
        embeddings = last_hidden_state[:, -1]
    else:
        # With right padding, select each sequence's last non-padding (EOS) position.
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = last_hidden_state.shape[0]
        embeddings = last_hidden_state[torch.arange(batch_size, device=last_hidden_state.device), sequence_lengths.long()]
    if do_normalize:
        embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
    return embeddings


model = AutoModel.from_pretrained(
    pretrained_model_name_or_path="lamarr-llm-development/elbembedding",
    trust_remote_code=True,
    token="xxx",  # replace with your Hugging Face access token
)
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="lamarr-llm-development/elbembedding",
    trust_remote_code=True,
    token="xxx",  # replace with your Hugging Face access token
)
model = model.to("cuda")

queries = [
    "Wer war der erste Bundeskanzler der Bundesrepublik Deutschland?",
    "Welche deutsche Stadt ist für ihre Bratwürste bekannt?",
]
queries = get_detailed_instruct(queries)
queries_inputs = tokenize(sentences=queries, tokenizer=tokenizer)
with torch.no_grad():
    queries_outputs = model(**queries_inputs)
queries_embs = pool(last_hidden_state=queries_outputs.last_hidden_state, attention_mask=queries_inputs.attention_mask)

passages = [
    "Konrad Adenauer (geboren am 5. Januar 1876 in Köln; gestorben am 19. April 1967 in Rhöndorf) war ein deutscher Politiker und der erste Bundeskanzler der Bundesrepublik Deutschland von 1949 bis 1963. Er war einer der Gründerväter der Bundesrepublik von Deutschland und spielte eine Schlüsselrolle beim Wiederaufbau nach dem Zweiten Weltkrieg.",
    "Nürnberg ist eine Stadt im deutschen Bundesland Bayern. Es ist bekannt für seine historische Altstadt, mittelalterliche Befestigungsanlagen und seinen jährlichen Weihnachtsmarkt. Nürnberg ist auch für seine Bratwurst bekannt, eine Wurstsorte, die in Deutschland ein beliebtes Streetfood ist.",
]
passages_inputs = tokenize(sentences=passages, tokenizer=tokenizer)
with torch.no_grad():
    passages_outputs = model(**passages_inputs)
passages_embs = pool(last_hidden_state=passages_outputs.last_hidden_state, attention_mask=passages_inputs.attention_mask)

# Embeddings are L2-normalized, so this matrix product is cosine similarity (scaled by 100).
scores = (queries_embs @ passages_embs.T) * 100
print(scores.tolist())
```
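Since `pool()` L2-normalizes the embeddings, the matrix product above is the cosine similarity between each query and passage, and the factor of 100 only rescales the values for readability; for the two examples, the highest score in row i should land on passage i. A minimal ranking step on top of these scores (reusing the names from the snippet above) might look like:

```python
# Rank passages for each query by descending similarity.
ranking = scores.argsort(dim=1, descending=True)
for i, order in enumerate(ranking.tolist()):
    print(f"Query {i}: best-matching passage index {order[0]}")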
Supported Languages
...
MTEB Benchmark Evaluation
...
FAQ
Do I need to add instructions to the query?
Yes. The model was trained with instruction-prefixed queries, so omitting the prefix degrades retrieval performance. Documents, on the other hand, need no instruction prefix.
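Concretely, with the helpers from the usage example above, only the query side is wrapped:

```python
# Query side: instruction prefix added before tokenizing.
query = get_detailed_instruct(["Welche deutsche Stadt ist für ihre Bratwürste bekannt?"])[0]
# query == "Instruct: Retrieve semantically similar text.\nQuery: Welche deutsche Stadt ..."

# Document side: no instruction prefix; tokenize the raw text.
passage_inputs = tokenize(sentences=["Nürnberg ist bekannt für seine Bratwurst."], tokenizer=tokenizer)
```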
Citation
...
Limitations
...