42 lines
1.3 KiB
Python
42 lines
1.3 KiB
Python
import os
|
|
from urllib.parse import urlparse, urljoin
|
|
import time
|
|
import pymysql
|
|
from pathlib import Path
|
|
import logging
|
|
from apscheduler.schedulers.blocking import BlockingScheduler
|
|
from apscheduler.events import EVENT_JOB_EXECUTED, EVENT_JOB_ERROR
|
|
import json
|
|
import re
|
|
from config import * # 导入配置
|
|
import datetime
|
|
|
|
def clean_control_characters(text):
    """Return *text* with control characters removed.

    Every character in the ranges U+0000-U+001F and U+007F-U+009F is
    deleted; all other characters are kept in their original order.
    """
    control_chars = re.compile(r'[\x00-\x1F\x7F-\x9F]')
    return control_chars.sub('', text)
|
|
# The input dump is read as UTF-16 text.
# The path is assembled with pathlib so it resolves on any OS, not only
# where "\\" is the path separator.
input_path = Path("data") / "2025-05-20" / "01-11-04.txt"
with open(input_path, 'r', encoding='utf-16') as f:
    raw_data = f.read()

# Remove control characters from the raw text, then parse it as JSON.
cleaned_data = clean_control_characters(raw_data)
data_list = json.loads(cleaned_data)

for data in data_list:  # iterate over every object in the parsed array
    # Undo the quote escaping in the English content. The full "&quot;"
    # entity is replaced first so no stray "&" is left behind; the bare
    # "quot;" form is still handled as before.
    # NOTE(review): confirm which of the two forms the source data carries.
    en_content = (
        data['EN_content']
        .replace('&quot;', '"')
        .replace('quot;', '"')
    )

    # Print the field names present in this record.
    for k in data:
        print(k)

    # Field values of this record in a fixed order.
    # NOTE(review): presumably one row for a database insert; the tuple is
    # not consumed anywhere in the visible code, and the meaning of the
    # trailing 0 is not visible here -- confirm.
    values = (
        data['id'],
        data['input_date'],
        data['words'],
        data['title_txt'],
        data['key_word'],
        data['CN_content'],
        en_content,  # the unescaped English content computed above
        data['URL'],
        data['abstract'],
        data['title_EN'],
        data.get('data_source', 'zzck'),
        json.dumps(data['c']),
        0,
    )
|
|
|