# Requires transformers>=4.51.0
import datetime
import logging
import time
from http import HTTPStatus

import torch
import torch.nn.functional as F
from torch import Tensor
from modelscope import AutoTokenizer, AutoModel

import dashscope

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

dashscope.api_key = 'sk-xxx'  # replace with your own DashScope API key


def embed_with_str(input):
    retry = 0
    max_retry = 5
    t = 0.2
    while retry < max_retry:
        resp = dashscope.TextEmbedding.call(
            model=dashscope.TextEmbedding.Models.text_embedding_v2,
            input=input)
        if resp.status_code == HTTPStatus.OK:
            return resp
        elif resp.status_code == 429:
            # Aliyun rate limit hit: back off before retrying
            logger.info(f'Rate limited, retrying in {t} seconds')
            time.sleep(t)
            retry += 1
            t += 0.1
        else:
            logger.error(f'Request failed with status code: {resp.status_code}')
            return None
    logger.error('Exceeded maximum number of retries')
    return None


def last_token_pool(last_hidden_states: Tensor,
                    attention_mask: Tensor) -> Tensor:
    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
    if left_padding:
        return last_hidden_states[:, -1]
    else:
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = last_hidden_states.shape[0]
        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]


def get_detailed_instruct(task_description: str, query: str) -> str:
    return f'Instruct: {task_description}\nQuery: {query}'


# Each query must come with a one-sentence instruction that describes the task
task = 'Given a web search query, retrieve relevant passages that answer the query'

queries = [
    get_detailed_instruct(task, 'What is the capital of China?'),
    get_detailed_instruct(task, 'Explain gravity')
]
# No need to add instruction for retrieval documents
documents = [
    "The capital of China is Beijing.",
    "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun."
]
input_texts = queries + documents

tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-Embedding-0.6B', padding_side='left')
model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-0.6B')

# We recommend enabling flash_attention_2 for better acceleration and memory saving.
# model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-0.6B', attn_implementation="flash_attention_2", torch_dtype=torch.float16).cuda()

print(datetime.datetime.now())
max_length = 8192

# Tokenize the input texts
batch_dict = tokenizer(
    input_texts,
    padding=True,
    truncation=True,
    max_length=max_length,
    return_tensors="pt",
)
batch_dict.to(model.device)
outputs = model(**batch_dict)
embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

# normalize embeddings
embeddings = F.normalize(embeddings, p=2, dim=1)
print("=========embeddings=========")
print(datetime.datetime.now())

scores = (embeddings[:2] @ embeddings[2:].T)
print(len(embeddings.tolist()[0]))  # dimensionality of the local Qwen3 embeddings
print(scores.tolist())
# [[0.7645568251609802, 0.14142508804798126], [0.13549736142158508, 0.5999549627304077]]

vector_obj = embed_with_str(input_texts)
vector = vector_obj.output["embeddings"][0]["embedding"]
print(len(vector))
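
# --- Optional: score queries against documents with the DashScope embeddings as well ---
# A minimal sketch, not part of the original script: it assumes embed_with_str()
# succeeded (vector_obj is not None) and that resp.output["embeddings"] contains one
# entry per input text in the same order as input_texts. The DashScope vectors may
# have a different dimensionality than the local Qwen3 embeddings, so only the
# relative query-document scores are comparable, not the vectors themselves.
if vector_obj is not None:
    remote = torch.tensor(
        [item["embedding"] for item in vector_obj.output["embeddings"]],
        dtype=torch.float32,
    )
    remote = F.normalize(remote, p=2, dim=1)   # unit-normalize, as done for the local embeddings
    remote_scores = remote[:2] @ remote[2:].T  # 2 queries x 2 documents
    print(remote_scores.tolist())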