# zzck/test.py
#
# 42 lines
# 1.3 KiB
# Python

import os
from urllib.parse import urlparse, urljoin
import time
import pymysql
from pathlib import Path
import logging
from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.events import EVENT_JOB_EXECUTED, EVENT_JOB_ERROR
import json
import re
from config import * # 导入配置
import datetime
def clean_control_characters(text):
return re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)
# 修改文件编码为UTF-16
with open("data\\2025-05-20\\01-11-04.txt", 'r', encoding='utf-16') as f: # 修改编码方式
raw_data = f.read()
cleaned_data = clean_control_characters(raw_data)
data_list = json.loads(cleaned_data)
for data in data_list: # 遍历数组中的每个对象
# 处理特殊字符转义
en_content = data['EN_content'].replace('quot;', '"')
for k,v in data.items():
print(k)
values = (
data['id'],
data['input_date'],
data['words'],
data['title_txt'],
data['key_word'],
data['CN_content'],
en_content, # 使用处理后的英文内容
data['URL'],
data['abstract'],
data['title_EN'],
data.get('data_source', 'zzck'),
json.dumps(data['c']),
0,
)