# pdf_code/zzb_data_prod/Embedding_test.py

# Requires transformers>=4.51.0
import datetime
import logging
import time
from http import HTTPStatus

import dashscope
import torch
import torch.nn.functional as F
from torch import Tensor
from modelscope import AutoTokenizer, AutoModel

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

dashscope.api_key = 'sk-f8413fcfa63a40e49dbb8ac1d0239f6d'
def embed_with_str(text_input):
    """Call the DashScope text-embedding API, backing off on rate limits."""
    retry = 0
    max_retry = 5
    t = 0.2
    while retry < max_retry:
        # The Alibaba Cloud endpoint rate-limits requests.
        resp = dashscope.TextEmbedding.call(
            model=dashscope.TextEmbedding.Models.text_embedding_v2,
            input=text_input)
        if resp.status_code == HTTPStatus.OK:
            return resp
        elif resp.status_code == 429:
            logger.info(f'Rate limited, retrying after {t} seconds')
            time.sleep(t)
            retry += 1
            t += 0.1
        else:
            logger.error(f'Request failed, status code: {resp.status_code}')
            return None
    logger.error('Retry limit exceeded')
    return None
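# Minimal usage sketch for embed_with_str (hedged: assumes the DashScope
# response layout used at the bottom of this file, resp.output["embeddings"]).
# Left commented out to avoid an extra API call:
# resp = embed_with_str(["hello world"])
# if resp is not None:
#     print(len(resp.output["embeddings"][0]["embedding"]))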
def last_token_pool(last_hidden_states: Tensor,
                    attention_mask: Tensor) -> Tensor:
    # With left padding, every sequence ends at the final position.
    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
    if left_padding:
        return last_hidden_states[:, -1]
    else:
        # With right padding, gather the hidden state of each sequence's last real token.
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = last_hidden_states.shape[0]
        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]
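# Quick self-check of last_token_pool on toy shapes (illustrative values only,
# not part of the original flow): with a left-padded mask, the pooled output is
# simply the last position of each sequence.
_toy_hidden = torch.arange(24, dtype=torch.float32).reshape(2, 3, 4)
_toy_mask = torch.tensor([[0, 1, 1], [1, 1, 1]])  # row 0 is left-padded
assert last_token_pool(_toy_hidden, _toy_mask).shape == (2, 4)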
def get_detailed_instruct(task_description: str, query: str) -> str:
    return f'Instruct: {task_description}\nQuery:{query}'
# Each query must come with a one-sentence instruction that describes the task
task = 'Given a web search query, retrieve relevant passages that answer the query'
queries = [
    get_detailed_instruct(task, 'What is the capital of China?'),
    get_detailed_instruct(task, 'Explain gravity')
]
# No need to add instruction for retrieval documents
documents = [
    "The capital of China is Beijing.",
    "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun."
]
input_texts = queries + documents
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-Embedding-0.6B', padding_side='left')
model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-0.6B')
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
# model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-0.6B', attn_implementation="flash_attention_2", torch_dtype=torch.float16).cuda()
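# Note: padding_side='left' above is what makes the fast left-padding branch in
# last_token_pool apply; with right padding, the gather branch is used instead.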
print(datetime.datetime.now())
max_length = 8192
# Tokenize the input texts
batch_dict = tokenizer(
    input_texts,
    padding=True,
    truncation=True,
    max_length=max_length,
    return_tensors="pt",
)
batch_dict.to(model.device)
outputs = model(**batch_dict)
embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
# normalize embeddings
embeddings = F.normalize(embeddings, p=2, dim=1)
print(f"=========embeddings=========")
print(datetime.datetime.now())
scores = (embeddings[:2] @ embeddings[2:].T)
print(embeddings.shape[1])  # embedding dimensionality of the local model
print(scores.tolist())
# Expected scores: [[0.7645568251609802, 0.14142508804798126], [0.13549736142158508, 0.5999549627304077]]
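# Cross-check (illustrative): the embeddings were L2-normalized above, so the
# dot products in `scores` equal cosine similarity.
assert torch.allclose(
    scores[0, 0],
    F.cosine_similarity(embeddings[0:1], embeddings[2:3]).squeeze(),
    atol=1e-5,
)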
vector_obj = embed_with_str(input_texts)
if vector_obj is not None:  # embed_with_str returns None on failure
    vector = vector_obj.output["embeddings"][0]["embedding"]
    print(len(vector))
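# Note the two vector spaces differ: Qwen3-Embedding-0.6B yields 1024-dim
# embeddings locally, while DashScope text_embedding_v2 returns 1536-dim
# vectors (dimensions as commonly documented; verify against the prints above).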