# NOTE: the import order below is kept exactly as in the original script.
# `from config import *` can rebind any name, so moving it relative to the
# other imports could change which binding wins.
import os
from urllib.parse import urlparse, urljoin
import time
import pymysql
from pathlib import Path
import logging
from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.events import EVENT_JOB_EXECUTED, EVENT_JOB_ERROR
import json
import re
from config import *  # import project configuration
import datetime

# C0 control characters, DEL, and C1 control characters.
_CONTROL_CHARS_RE = re.compile(r'[\x00-\x1F\x7F-\x9F]')


def clean_control_characters(text: str) -> str:
    """Return *text* with every C0/C1 control character and DEL removed.

    This strips the range U+0000-U+001F (which includes tab, line feed and
    carriage return) as well as U+007F-U+009F.
    """
    return _CONTROL_CHARS_RE.sub('', text)


# Build the path from components so it resolves on any OS, not only where
# the backslash is the path separator.
input_path = Path("data") / "2025-05-20" / "01-11-04.txt"

# The input file is read as UTF-16 text.
raw_data = input_path.read_text(encoding='utf-16')

# Control characters are removed before parsing the text as JSON.
cleaned_data = clean_control_characters(raw_data)
data_list = json.loads(cleaned_data)

for data in data_list:  # iterate over each object in the parsed array
    # Turn the escaped-quote text back into a literal double quote.
    # NOTE(review): this matches the bare text 'quot;', so an input holding
    # '&quot;' would come out as '&"' - confirm the ampersand is already
    # removed upstream before changing the pattern.
    en_content = data['EN_content'].replace('quot;', '"')

    # Print every key of the record (only the keys are needed here).
    for k in data:
        print(k)

    # Field values of the record, collected in a fixed order.
    values = (
        data['id'],
        data['input_date'],
        data['words'],
        data['title_txt'],
        data['key_word'],
        data['CN_content'],
        en_content,  # the processed English content
        data['URL'],
        data['abstract'],
        data['title_EN'],
        data.get('data_source', 'zzck'),  # falls back to 'zzck' when absent
        json.dumps(data['c']),  # serialized back to a JSON string
        0,  # constant; its meaning is not visible in this script
    )