# Requires transformers>=4.51.0
import datetime
import logging
import time
from http import HTTPStatus

import torch
import torch.nn.functional as F
from torch import Tensor
from modelscope import AutoTokenizer, AutoModel

import dashscope

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

dashscope.api_key = 'sk-xxx'  # replace with your own DashScope API key


def embed_with_str(input):
    retry = 0
    max_retry = 5
    t = 0.2
    while retry < max_retry:
        resp = dashscope.TextEmbedding.call(
            model=dashscope.TextEmbedding.Models.text_embedding_v2,
            input=input)
        if resp.status_code == HTTPStatus.OK:
            return resp
        elif resp.status_code == 429:
            # Aliyun rate limit hit: back off before retrying
            logger.info(f'Rate limited, retrying in {t} seconds')
            time.sleep(t)
            retry += 1
            t += 0.1
        else:
            logger.error(f'Request failed with status code: {resp.status_code}')
            return None
    logger.error('Exceeded maximum number of retries')
    return None


def last_token_pool(last_hidden_states: Tensor,
                    attention_mask: Tensor) -> Tensor:
    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
    if left_padding:
        return last_hidden_states[:, -1]
    else:
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = last_hidden_states.shape[0]
        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]


def get_detailed_instruct(task_description: str, query: str) -> str:
    return f'Instruct: {task_description}\nQuery: {query}'


# Each query must come with a one-sentence instruction that describes the task
task = 'Given a web search query, retrieve relevant passages that answer the query'

queries = [
    get_detailed_instruct(task, 'What is the capital of China?'),
    get_detailed_instruct(task, 'Explain gravity')
]
# No need to add instruction for retrieval documents
documents = [
    "The capital of China is Beijing.",
    "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun."
]
input_texts = queries + documents

tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-Embedding-0.6B', padding_side='left')
model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-0.6B')

# We recommend enabling flash_attention_2 for better acceleration and memory saving.
# model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-0.6B', attn_implementation="flash_attention_2", torch_dtype=torch.float16).cuda()

print(datetime.datetime.now())
max_length = 8192

# Tokenize the input texts
batch_dict = tokenizer(
    input_texts,
    padding=True,
    truncation=True,
    max_length=max_length,
    return_tensors="pt",
)
batch_dict.to(model.device)
outputs = model(**batch_dict)
embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

# normalize embeddings
embeddings = F.normalize(embeddings, p=2, dim=1)
print("=========embeddings=========")
print(datetime.datetime.now())

scores = (embeddings[:2] @ embeddings[2:].T)
print(len(embeddings.tolist()[0]))  # dimensionality of the local Qwen3 embeddings
print(scores.tolist())
# [[0.7645568251609802, 0.14142508804798126], [0.13549736142158508, 0.5999549627304077]]

vector_obj = embed_with_str(input_texts)
vector = vector_obj.output["embeddings"][0]["embedding"]
print(len(vector))
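
# --- Optional: score queries against documents with the DashScope embeddings as well ---
# A minimal sketch, not part of the original script: it assumes embed_with_str()
# succeeded (vector_obj is not None) and that resp.output["embeddings"] contains one
# entry per input text in the same order as input_texts. The DashScope vectors may
# have a different dimensionality than the local Qwen3 embeddings, so only the
# relative query-document scores are comparable, not the vectors themselves.
if vector_obj is not None:
    remote = torch.tensor(
        [item["embedding"] for item in vector_obj.output["embeddings"]],
        dtype=torch.float32,
    )
    remote = F.normalize(remote, p=2, dim=1)   # unit-normalize, as done for the local embeddings
    remote_scores = remote[:2] @ remote[2:].T  # 2 queries x 2 documents
    print(remote_scores.tolist())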