2025-06-11 10:49:55 +08:00
|
|
|
|
import pika
|
|
|
|
|
|
import json
|
|
|
|
|
|
import logging
|
|
|
|
|
|
import time
|
|
|
|
|
|
import os
|
2026-03-05 17:34:29 +08:00
|
|
|
|
import re
|
2025-06-11 10:49:55 +08:00
|
|
|
|
import threading
|
2026-03-05 17:34:29 +08:00
|
|
|
|
from datetime import datetime
|
2025-06-11 10:49:55 +08:00
|
|
|
|
from concurrent.futures import ThreadPoolExecutor
|
|
|
|
|
|
from queue import Queue
|
|
|
|
|
|
from config import *
|
2026-03-05 17:34:29 +08:00
|
|
|
|
import jieba
|
|
|
|
|
|
from llm_process import send_mq, get_label, get_translation, is_mostly_chinese, get_tdx_map, get_stock_map
|
|
|
|
|
|
from label_check import load_label_mapping, validate_tags
|
|
|
|
|
|
from check_sensitivity import SensitiveWordFilter
|
|
|
|
|
|
from news_clustering import NewsClusterer
|
|
|
|
|
|
from cal_etf_labels import *
|
|
|
|
|
|
import pandas as pd
|
|
|
|
|
|
|
|
|
|
|
|
import threading
|
|
|
|
|
|
import time
|
|
|
|
|
|
from datetime import datetime
|
|
|
|
|
|
from apscheduler.schedulers.background import BackgroundScheduler
|
2025-06-11 10:49:55 +08:00
|
|
|
|
|
|
|
|
|
|
# Module-level map of media outlet -> authority score, loaded once at import.
# File format: one "<media>\t<score>" pair per line; blank lines are skipped
# and malformed lines are reported but do not abort the load.
media_score = {}
with open("media_score.txt", "r", encoding="utf-8") as f:
    for raw_line in f:
        line = raw_line.strip()
        if not line:
            continue
        try:
            media, score = line.split("\t")
            media_score[media.strip()] = int(score)
        except ValueError as e:
            # Either the tab split or the int() conversion failed.
            print(f"解析错误: {e},行内容: {line}")
            continue
|
|
|
|
|
|
|
2026-03-05 17:34:29 +08:00
|
|
|
|
# Mandatory level-1/level-2 label checks for industries and concepts:
# these mappings are used later by validate_tags() to drop labels that
# are not present in the whitelist files.
concept_mapping = load_label_mapping("concept_mapping.txt")
industry_mapping = load_label_mapping("industry_mapping.txt")
|
|
|
|
|
|
|
|
|
|
|
|
# Sensitive-word screening, backed by a word list on disk.
sensitive_filter = SensitiveWordFilter("sensitive_words.txt")

# Clustering-based scoring setup.
# Silence pika's chatty INFO logs; only warnings and above are kept.
logging.getLogger("pika").setLevel(logging.WARNING)

JIEBA_DICT_PATH = '/root/zzck/news_distinct_task/dict.txt.big'  # jieba tokenizer dictionary path
if os.path.exists(JIEBA_DICT_PATH):
    # Use the bigger dictionary when available; otherwise jieba falls back
    # to its bundled default.
    jieba.set_dictionary(JIEBA_DICT_PATH)

# Clusterer instance shared by all worker threads; clusters are refreshed
# periodically inside process_single_message().
news_cluster = NewsClusterer()
news_cluster.load_clusters()
|
|
|
|
|
|
|
|
|
|
|
|
# Keyword bonus table: titles matching one of these words get extra score.
# File format: one "<word>,<bonus>" pair per line; bad lines are reported
# and skipped so a single typo cannot break startup.
words_bonus = {}
with open("bonus_words.txt", "r", encoding="utf-8") as f:
    for raw_line in f:
        line = raw_line.strip()
        if not line:
            continue
        try:
            word, bonus = line.split(",")
            words_bonus[word.strip()] = int(bonus)
        except ValueError as e:
            # Either the comma split or the int() conversion failed.
            print(f"关键词加分项解析错误: {e},行内容: {line}")
            continue
print(words_bonus)
|
|
|
|
|
|
|
|
|
|
|
|
# 申万一级行业热度分, 从csv读入dataframe后直接转dict
|
|
|
|
|
|
# --- 1. Initialisation ---
# CSV with the latest 5-day Shenwan level-1 industry heat ranking.
# Assumed columns: 'name' (industry) and 'heat_score' — confirm against the
# upstream industry_heat_task job.
sw_heat_file = "/root/zzck/industry_heat_task/sw_data/heat_ranking_newest_5d.csv"

def load_sw_map():
    """Load the industry-name -> heat-score mapping from ``sw_heat_file``.

    Returns:
        dict | None: the mapping on success, or ``None`` when the file is
        missing or unreadable, so callers can decide whether to keep a
        previously loaded mapping.
    """
    try:
        df = pd.read_csv(sw_heat_file)
        # One-liner: index by industry name, take the score column as a dict.
        return df.set_index('name')['heat_score'].to_dict()
    except Exception as e:
        print(f"读取CSV失败: {e}")
        return None

# Global heat map. Fall back to an empty dict when the initial load fails so
# later lookups (sw_heat_map.get(...)) never crash on a None value.
sw_heat_map = load_sw_map() or {}
|
|
|
|
|
|
|
|
|
|
|
|
# --- 2. 定义更新任务 ---
|
|
|
|
|
|
def update_sw_heat_job():
    """Scheduled job: reload the Shenwan heat map and swap it in atomically."""
    global sw_heat_map
    print(f"[{datetime.now()}] 开始执行每日热度数据更新...")
    refreshed = load_sw_map()
    if not refreshed:
        # Load failed (or returned nothing): keep the current map untouched.
        return
    # Rebinding a module-level name is atomic in CPython, so concurrently
    # running workers simply pick up the new reference on their next read.
    sw_heat_map = refreshed
    print("sw_heat_map 更新成功")
    print(f"当前sw_heat_map示例: {list(sw_heat_map.items())[:5]}")  # sanity-check the first 5 entries
|
|
|
|
|
|
|
|
|
|
|
|
# --- 3. Start the background scheduler ---
scheduler = BackgroundScheduler()
# Refresh the heat map every day at 18:10 — presumably after the daily
# heat-ranking CSV has been regenerated; confirm against the upstream job.
scheduler.add_job(update_sw_heat_job, 'cron', hour=18, minute=10)
scheduler.start()
|
|
|
|
|
|
|
|
|
|
|
|
|
2025-06-11 10:49:55 +08:00
|
|
|
|
# Idempotency store: IDs of messages already processed (shared across worker
# threads, so every access must hold the lock below).
processed_ids = set()
processed_ids_lock = threading.Lock()  # serialises access to processed_ids

# In-memory staging queue feeding the batch processor.
message_queue = Queue()
BATCH_SIZE = 24        # messages processed per batch
MAX_WORKERS = 24       # thread-pool size for a batch
MIN_BATCH_SIZE = 12    # minimum queue depth for the interval-based trigger
PROCESS_INTERVAL = 10  # seconds between interval-triggered flushes
|
|
|
|
|
|
|
|
|
|
|
|
def process_single_message(data):
    """Score, label, and optionally translate a single news message.

    Pipeline: idempotency check -> labeling (get_label + validate_tags) ->
    base score from source authority / opinion quality / China relevance /
    industry heat -> keyword bonus -> cluster-based experimental score ->
    several 0.79x penalty rules -> (for news_score >= 80) LLM translation and
    enrichment -> final field assembly on ``tagged_news``.

    Returns:
        (tagged_news, True) on success, (None, True) when the message was
        already processed, (None, False) on error.
    """
    try:
        id_str = str(data["id"])
        input_date = data["input_date"]
        # print(id_str + "\t" + str(input_date))

        # Idempotency check: skip messages we have already seen.
        with processed_ids_lock:
            if id_str in processed_ids:
                print(f"跳过已处理的消息: {id_str}")
                return None, True  # (None, True): nothing to send, already handled
            # Mark as processed up front to prevent concurrent duplicates.
            processed_ids.add(id_str)
            # Crude memory bound: wipe the whole set once it grows too large.
            # NOTE(review): this briefly re-opens the door to duplicates.
            if len(processed_ids) > 10000:
                processed_ids.clear()

        content = data.get('CN_content', "").strip()
        source = "其他"
        # Walk the nested payload c[0] -> b[0] -> d[0] to find the source name.
        # Assumes this is the upstream feed's envelope schema — TODO confirm.
        category_data = data.get('c', [{}])
        category = ""
        if category_data:
            category = category_data[0].get('category', '')
            b_data = category_data[0].get('b', [{}])
            if b_data:
                d_data = b_data[0].get('d', [{}])
                if d_data:
                    source = d_data[0].get('sourcename', "其他")
        # Media authority score; unknown outlets default to 5.
        source_impact = media_score.get(source, 5)
        tagged_news = get_label(content, source)

        # Force industry/concept labels through the whitelist mappings.
        tagged_news = validate_tags(tagged_news, "industry_label", "industry_confidence", industry_mapping)
        tagged_news = validate_tags(tagged_news, "concept_label", "concept_confidence", concept_mapping)

        industry_confidence = tagged_news.get("industry_confidence", [])
        concept_confidence = tagged_news.get("concept_confidence", [])
        # Round confidences to 2 decimals for downstream display/storage.
        industry_confidence = list(map(lambda x: round(x, 2), industry_confidence))
        concept_confidence = list(map(lambda x: round(x, 2), concept_confidence))

        industry_label = tagged_news.get("industry_label", [])
        sw_heat = 10  # default heat score when labels or heat data are missing
        if industry_label and industry_confidence:
            # Labels look like "<level1>-<level2>"; take the level-1 part.
            first_industry = industry_label[0].split("-")[0]
            first_industry_confidence = industry_confidence[0]
            # Only trust the heat lookup when the top label is confident enough.
            if first_industry_confidence >= 0.75:
                sw_heat = sw_heat_map.get(first_industry, 0)

        public_opinion_score = tagged_news.get("public_opinion_score", 30)  # content-quality score
        China_factor = tagged_news.get("China_factor", 0.2)  # relevance to the Chinese stock market
        # Weighted base score; weights are hand-tuned, see git history for changes.
        news_score = source_impact * 0.03 + public_opinion_score * 0.3 + China_factor * 35 + sw_heat * 0.05

        # Per-label scores scale confidence by the overall news score.
        industry_score = list(map(lambda x: round(x * news_score, 2), industry_confidence))
        concept_score = list(map(lambda x: round(x * news_score, 2), concept_confidence))

        # Keyword bonus: e.g. a title hit on "Google" adds points
        # (whole-word, case-insensitive match against the original title).
        ori_title = data.get('title_EN', '')
        hits = []
        for word, bonus in words_bonus.items():
            if re.search(rf'\b{re.escape(word)}\b', ori_title, re.IGNORECASE):
                hits.append((word, bonus))
        if hits:
            # Only the single highest-bonus hit counts.
            hits_sorted = sorted(hits, key=lambda x: x[1], reverse=True)
            print(f"原文标题{ori_title} 命中关键词及分数:", hits_sorted)
            add_score = hits_sorted[0][-1] * 0.05
            print(f"新闻{id_str} 当前分数news_score变动: {news_score} -> {news_score + add_score} ... 如果超出上限后续会调整...")
            news_score = min(99.0, news_score + add_score)
        else:
            pass

        # Cluster-based score (experimental for now). Clusters are reloaded
        # once per hour, keyed on wall-clock minute == 12.
        if datetime.now().minute == 12:
            if not news_cluster.load_clusters():
                print(f"聚类文件加载失败 {datetime.now()}")
        CN_title = data.get('title_txt', '')
        cluster_evaluate = news_cluster.evaluate_news(title=CN_title, content=content)
        # sqrt compresses the weighted score into a 0-10-ish band; 20.0 is the
        # fallback when clustering returns nothing.
        cluster_score = (cluster_evaluate['weighted_score'] ** 0.5) * 10 if cluster_evaluate else 20.0
        center_news_id = cluster_evaluate['center_news_id'] if cluster_evaluate else None
        news_score_exp = source_impact * 0.04 + cluster_score * 0.25 + China_factor * 35

        # Penalty: likely advertorial (contains all three contact keywords).
        if all(keyword in content for keyword in ["邮箱", "电话", "网站"]):
            print("新闻判别为广告软文,分数强制8折")
            news_score *= 0.79
            news_score_exp *= 0.79

        # Penalty: a high-scoring item with no valid labels at all.
        if news_score >= 80 and len(tagged_news.get("industry_label", [])) == 0 and len(tagged_news.get("concept_label", [])) == 0:
            print("新闻没有打上任何有效标签,分数强制8折")
            news_score *= 0.79
            news_score_exp *= 0.79

        # Penalty: sensitive words in the (Chinese) title or content.
        if news_score >= 80:
            title = data.get('title_txt', '')
            news_text = f"{title} {content}"
            result = sensitive_filter.detect(news_text)
            if result:
                print("新闻命中敏感词: ", ", ".join(list(result)), "; 分数强制8折")
                news_score *= 0.79
                news_score_exp *= 0.79

        # Penalty: selected news must have at least one label with confidence >= 0.7.
        if news_score >= 80:
            industry_high_confidence = industry_confidence[0] if len(industry_confidence) > 0 else 0
            concept_high_confidence = concept_confidence[0] if len(concept_confidence) > 0 else 0
            if max(industry_high_confidence, concept_high_confidence) < 0.7:
                news_score *= 0.79
                news_score_exp *= 0.79
                print(f"行业标签置信度为{industry_high_confidence}, 概念标签置信度为{concept_high_confidence}, 新闻得分强制8折.")

        news_score = round(news_score, 2)
        news_score_exp = round(news_score_exp, 2)
        cluster_score = round(cluster_score, 2)

        # High-scoring news gets an LLM translation plus enrichment fields.
        if news_score >= 80:
            ori_content = data.get('EN_content', '')
            ori_title = data.get('title_EN', '')
            for _ in range(3):
                # Translation can be flaky; retry up to three times.
                translation = get_translation(ori_content, ori_title)
                llm_content = translation.get("llm_content", "") if translation else ""
                llm_title = translation.get("llm_title", "") if translation else ""
                # Discard outputs that did not actually come back in Chinese.
                if not is_mostly_chinese(llm_title, threshold=0.4):
                    print("新闻标题没有被翻译成中文, 会被舍弃, LLM标题为: ", llm_title)
                    llm_title = ""
                    continue
                if not is_mostly_chinese(llm_content, threshold=0.5):
                    print("新闻正文没有被翻译成中文, 会被舍弃, LLM正文为: ", llm_content[:30] + "...")
                    llm_content = ""
                    continue
                # Added 2025-09-22: fine-grained fields — overseas event/macro,
                # China macro, industry/company news, reprint source, company name.
                overseas_event = translation.get("overseas_event", None) if translation else None
                overseas_macro = translation.get("overseas_macro", None) if translation else None
                china_macro = translation.get("china_macro", None) if translation else None
                industry_news = translation.get("industry_news", None) if translation else None
                company_news = translation.get("company_news", None) if translation else None
                reprint_source = translation.get("reprint_source", None) if translation else None
                if reprint_source == "":
                    reprint_source = None
                company_name = translation.get("company_name", None) if translation else None
                if company_name:
                    # Patch (2025-12-19): cap extracted company names at three.
                    company_name_list = company_name.strip().split(",")
                    if len(company_name_list) > 3:
                        company_name = ",".join(company_name_list[:3])
                if company_name == "":
                    company_name = None
                ## industry_name is redundant with the earlier labels; dropped for now.
                # industry_name = translation.get("industry_name", None) if translation else None

                # Added 2025-12-17: TDX industry labels + confidence, filtered
                # against the known TDX industry map, at most three kept.
                tdx_industry_ori = translation.get("tdx_industry", None) if translation else None
                tdx_industry_confidence_ori = translation.get("tdx_industry_confidence", None) if translation else None
                tdx_map = get_tdx_map()
                tdx_industry = []
                tdx_industry_confidence = []
                if tdx_industry_ori and tdx_industry_confidence_ori and len(tdx_industry_confidence_ori) == len(tdx_industry_ori):
                    for industry, confidence in zip(tdx_industry_ori, tdx_industry_confidence_ori):
                        if industry in tdx_map:
                            tdx_industry.append(industry)
                            tdx_industry_confidence.append(float(confidence))
                            if len(tdx_industry) == 3:
                                break

                # Added 2025-12-26: extract A-share 6-digit stock codes (list,
                # at most 5 from the LLM), validated against the known code map.
                stock_codes_ori = translation.get("stock_codes", None) if translation else None
                if stock_codes_ori:
                    stock_codes_ori = list(set(stock_codes_ori))
                stock_codes = []
                stock_names = []
                stock_map = get_stock_map()
                if stock_codes_ori and isinstance(stock_codes_ori, list):
                    for stock_code in stock_codes_ori:
                        if stock_code in stock_map:
                            stock_name = stock_map.get(stock_code)
                            stock_codes.append(stock_code)
                            stock_names.append(stock_name)

                # Added 2026-01-28: review scores — political sensitivity and
                # "export-turned-domestic" (China news rewritten as foreign news).
                political_sensitivity = translation.get("political_sensitivity", None) if translation else None
                export_domestic = translation.get("export_domestic", None) if translation else None
                political_notes = translation.get("political_notes", None) if translation else None
                additional_notes = translation.get("additional_notes", None) if translation else None

                # Added 2026-02-26: LLM-generated abstract.
                llm_abstract = translation.get("llm_abstract", None) if translation else None

                # Accept this attempt only when the core outputs look complete.
                if llm_content and llm_title and len(llm_content) > 30 and len(llm_title) > 3 and overseas_event is not None and overseas_macro is not None:
                    break

            # Attach ETF labels/names for selected news.
            etf_labels = get_etf_labels(title=llm_title, industry_label=tagged_news.get("industry_label", []), industry_confidence=tagged_news.get("industry_confidence", []), concept_label=tagged_news.get("concept_label", []), concept_confidence=tagged_news.get("concept_confidence", []))
            etf_names = [etf_name[label] for label in etf_labels]
            tagged_news["etf_labels"] = etf_labels
            tagged_news["etf_names"] = etf_names

            tagged_news["title"] = llm_title
            tagged_news["rewrite_content"] = llm_content
            tagged_news["overseas_event"] = overseas_event
            tagged_news["overseas_macro"] = overseas_macro
            tagged_news["china_macro"] = china_macro
            tagged_news["industry_news"] = industry_news
            tagged_news["company_news"] = company_news
            tagged_news["reprint_source"] = reprint_source
            tagged_news["company_name"] = company_name

            # Empty lists are normalised to None for downstream consumers.
            tagged_news["tdx_industry"] = tdx_industry if tdx_industry else None
            tagged_news["tdx_industry_confidence"] = tdx_industry_confidence if tdx_industry_confidence else None

            tagged_news["stock_codes"] = stock_codes if stock_codes else None
            tagged_news["stock_names"] = stock_names if stock_names else None

            tagged_news["political_sensitivity"] = political_sensitivity if political_sensitivity else None
            tagged_news["export_domestic"] = export_domestic if export_domestic else None
            tagged_news["political_notes"] = political_notes if political_notes else None
            tagged_news["additional_notes"] = additional_notes if additional_notes else None

            tagged_news["llm_abstract"] = llm_abstract if llm_abstract else None

        # Common output fields, set regardless of score.
        tagged_news["source"] = source
        tagged_news["source_impact"] = source_impact
        tagged_news["industry_score"] = industry_score
        tagged_news["concept_score"] = concept_score
        tagged_news["news_score"] = news_score
        tagged_news["news_score_exp"] = news_score_exp
        tagged_news["center_news_id"] = center_news_id
        tagged_news["id"] = id_str
        tagged_news["cluster_score"] = cluster_score
        tagged_news["industry_confidence"] = industry_confidence
        tagged_news["concept_confidence"] = concept_confidence

        if news_score >= 80:
            # Safety net (2026-01-28): re-run the sensitive-word filter on the
            # LLM-translated title/content as well.
            news_title_llm = tagged_news.get("title", "")
            news_content_llm = tagged_news.get("rewrite_content", "")
            news_text_llm = f"{news_title_llm} {news_content_llm}"
            result = sensitive_filter.detect(news_text_llm)
            if result:
                print("大模型翻译内容命中敏感词: ", ", ".join(list(result)), "; 分数强制8折")
                tagged_news["news_score"] = news_score * 0.79

            # export_domestic >= 80 or political_sensitivity >= 70 also pushes
            # the item out of the featured pool (via the same 0.79x penalty).
            if tagged_news.get("export_domestic", 0) >= 80 or tagged_news.get("political_sensitivity", 0) >= 70:
                print(f'本条资讯的出口转内销得分为{tagged_news.get("export_domestic", 0)}, 政治敏感得分为{tagged_news.get("political_sensitivity", 0)}, political_notes信息为【{tagged_news.get("political_notes", "")}】, 分数8折')
                tagged_news["news_score"] = news_score * 0.79

        #print(json.dumps(tagged_news, ensure_ascii=False))
        # Diagnostic summary line for every processed message.
        print(tagged_news["id"], tagged_news["news_score"], tagged_news["news_score_exp"], tagged_news["industry_label"], tagged_news["concept_label"], public_opinion_score, tagged_news["cluster_score"], tagged_news["center_news_id"], input_date)
        if news_score >= 80:
            # Extra diagnostics for featured (high-score) news.
            print("LLM translation: ", tagged_news.get("rewrite_content", "")[:100])
            print("LLM overseas_event score: ", tagged_news.get("overseas_event", None))
            print("LLM overseas_macro score: ", tagged_news.get("overseas_macro", None))
            print("LLM china_macro score: ", tagged_news.get("china_macro", None))
            if tagged_news.get("reprint_source", None):
                print("LLM reprint_source: ", tagged_news.get("reprint_source", None))
            if tagged_news.get("company_name", None):
                print("LLM company_name: ", tagged_news.get("company_name", None), tagged_news.get("company_news", None))
            print("ETF NAMES: ", tagged_news.get("etf_names", []))
            if tagged_news.get("tdx_industry", None):
                print("LLM_max tdx_industry: ", tagged_news.get("tdx_industry", None))
            if tagged_news.get("tdx_industry_confidence", None):
                print("LLM_max tdx_industry_confidence: ", tagged_news.get("tdx_industry_confidence", None))
            if tagged_news.get("stock_names", None):
                print("LLM_max stock codes & names: ", tagged_news.get("stock_codes", None), tagged_news.get("stock_names", None))
            if tagged_news.get("political_sensitivity", None):
                print("LLM_max political_sensitivity: ", tagged_news.get("political_sensitivity", None))
            if tagged_news.get("llm_abstract", None):
                print("LLM_max llm_abstract: ", tagged_news.get("llm_abstract", None)[:30])

        return tagged_news, True

    except Exception as e:
        print(f"处理消息时出错: {str(e)}")
        # On failure, un-mark the ID so a redelivery gets another chance.
        with processed_ids_lock:
            if id_str in processed_ids:
                processed_ids.remove(id_str)
        return None, False
|
|
|
|
|
|
|
|
|
|
|
|
def process_message_batch(batch):
    """Process a batch of messages in parallel and publish the results to MQ.

    Args:
        batch: list of decoded message dicts (must be non-empty — the
            timing summary divides by ``len(batch)``).

    Each message is handed to ``process_single_message`` via a thread pool;
    non-None results are then sent with ``send_mq``. Failures on individual
    messages or sends are logged and do not abort the rest of the batch.
    """
    # NOTE: this string was previously placed *after* the first statement,
    # where it was a no-op expression rather than a docstring — moved up.
    start_time = time.time()
    results = []

    # Fan the batch out across the thread pool.
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = [executor.submit(process_single_message, data) for data in batch]

        for future in futures:
            try:
                # process_single_message returns (result_or_None, success_flag);
                # only the result is needed here.
                result, _success = future.result()
                if result:
                    results.append(result)
            except Exception as e:
                print(f"处理消息时发生异常: {str(e)}")

    # Publish processed results; one MQ failure must not block the others.
    for result in results:
        try:
            send_mq(result)
        except Exception as e:
            print(f"发送消息到MQ失败: {str(e)}")

    duration = time.time() - start_time
    print(f"批量处理 {len(batch)} 条消息, 耗时: {duration:.2f}s, "
          f"平均: {duration/len(batch):.3f}s/条")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def message_callback(ch, method, properties, body):
    """RabbitMQ delivery callback: decode the body and stage it for batching.

    This function only enqueues — actual processing and acking happen in the
    consumer's batch loop, which is why the delivery tag travels with the data.
    """
    try:
        payload = json.loads(body)
        # Pair the message with its delivery tag so it can be acked later.
        message_queue.put((payload, method.delivery_tag))
    except Exception as e:
        print(f"消息处理失败: {str(e)}")
        # Reject without requeue: a message that failed to decode once
        # will never decode.
        ch.basic_nack(delivery_tag=method.delivery_tag, requeue=False)
|
|
|
|
|
|
|
|
|
|
|
|
def create_connection():
    """Create and return a blocking RabbitMQ connection to localhost.

    Credentials come from the config module (mq_user / mq_password).
    """
    params = pika.ConnectionParameters(
        host="localhost",
        credentials=pika.PlainCredentials(mq_user, mq_password),
        heartbeat=600,           # keep idle connections alive through firewalls
        connection_attempts=3,   # retry the initial connect up to 3 times
        retry_delay=5,           # 5 seconds between connection attempts
    )
    return pika.BlockingConnection(params)
|
|
|
|
|
|
|
|
|
|
|
|
def _drain_message_batch(max_items=None):
    """Pop up to ``max_items`` staged messages off ``message_queue``.

    Returns:
        (batch, delivery_tags): parallel lists of message dicts and their
        RabbitMQ delivery tags; both empty when the queue is empty.
        ``max_items=None`` drains the entire queue.
    """
    batch = []
    delivery_tags = []
    while not message_queue.empty():
        if max_items is not None and len(batch) >= max_items:
            break
        data, delivery_tag = message_queue.get()
        batch.append(data)
        delivery_tags.append(delivery_tag)
    return batch, delivery_tags


def start_consumer():
    """Run the MQ consumer loop (batched version); reconnects on failure.

    Outer loop: (re)establish the connection and topology. Inner loop: pump
    network events, flush batches from ``message_queue`` through
    ``process_message_batch``, and ack only after a batch succeeds.
    """
    while True:
        # Initialise up front so the finally block below never references an
        # unbound name when create_connection() itself fails.
        connection = None
        try:
            connection = create_connection()
            channel = connection.channel()

            # Prefetch enough messages to keep a few batches in flight.
            channel.basic_qos(prefetch_count=BATCH_SIZE * 3)

            channel.exchange_declare(
                exchange="zzck_exchange",
                exchange_type="fanout"
            )

            # Declare the work queue and bind it to the fanout exchange.
            res = channel.queue_declare(queue="to_ai")
            # res = channel.queue_declare(queue='', exclusive=True)
            mq_queue = res.method.queue
            channel.queue_bind(
                exchange="zzck_exchange",
                queue=mq_queue,
            )

            # Manual acks: messages are confirmed only after batch processing.
            channel.basic_consume(
                queue=mq_queue,
                on_message_callback=message_callback,
                auto_ack=False
            )

            print(f"消费者已启动,批量大小: {BATCH_SIZE}, 工作线程: {MAX_WORKERS}, 等待消息...")

            last_process_time = time.time()
            # Main loop: pump network events and flush batches.
            while True:
                connection.process_data_events(time_limit=0.1)  # non-blocking pump

                current_time = time.time()
                queue_size = message_queue.qsize()

                # Dual trigger: full batch accumulated, or interval elapsed
                # with at least a minimum batch waiting.
                if queue_size >= BATCH_SIZE or \
                        (current_time - last_process_time >= PROCESS_INTERVAL and queue_size >= MIN_BATCH_SIZE):
                    batch, delivery_tags = _drain_message_batch(BATCH_SIZE)
                    if batch:
                        process_message_batch(batch)
                        # Ack only after the whole batch was processed.
                        for tag in delivery_tags:
                            channel.basic_ack(tag)
                    last_process_time = current_time

                # Safety valve: a backlog smaller than MIN_BATCH_SIZE must not
                # wait forever — after 5 intervals, flush whatever is queued.
                elif current_time - last_process_time >= PROCESS_INTERVAL * 5 and queue_size > 0:
                    batch, delivery_tags = _drain_message_batch()
                    if batch:
                        process_message_batch(batch)
                        for tag in delivery_tags:
                            channel.basic_ack(tag)
                    last_process_time = current_time

                # Leave the inner loop when the connection died; the outer
                # loop will reconnect.
                if not connection or connection.is_closed:
                    break

        except pika.exceptions.ConnectionClosedByBroker:
            print("连接被代理关闭,将在5秒后重试...")
            time.sleep(5)
        except pika.exceptions.AMQPConnectionError:
            print("连接失败,将在10秒后重试...")
            time.sleep(10)
        except KeyboardInterrupt:
            print("消费者被用户中断")
            try:
                if connection and connection.is_open:
                    connection.close()
            except Exception:
                pass
            break
        except Exception as e:
            print(f"消费者异常: {str(e)}")
            print("将在15秒后重试...")
            time.sleep(15)
        finally:
            # Best-effort cleanup before reconnecting or exiting.
            try:
                if connection and connection.is_open:
                    connection.close()
            except Exception:
                pass
|
|
|
|
|
|
|
|
|
|
|
|
# Entry point: run the blocking RabbitMQ consumer loop until interrupted.
if __name__ == "__main__":
    start_consumer()
|