# Source: sdk/AAAA.PY (file-viewer header: 60 lines, 1.8 KiB, Python)

def decrypt_to_jsonl(self, excel_file=None, output_file=None):
    """Decrypt the chat records listed in an Excel sheet into a JSONL file.

    The sheet must contain the columns ``encrypt_key`` and
    ``encrypt_chat_msg``. Each distinct (key, message) pair is passed to the
    external tool at ``self.sdktools_path`` as
    ``[sdktools_path, '3', key, message]``; the tool's stdout is decoded as
    UTF-8, stripped, and written as one line of the output file.
    NOTE(review): the meaning of the ``'3'`` argument is defined by the SDK
    tool and is not visible here.

    Args:
        excel_file: Path of the input workbook. Defaults to ``chat.xlsx``
            inside ``self.today_dir``.
        output_file: Path of the JSONL file to write. Defaults to
            ``chatdata.jsonl`` inside ``self.today_dir``. An existing file
            at this path is deleted before processing starts.

    Returns:
        None. If ``excel_file`` does not exist, an error message is printed
        and the function returns without touching ``output_file``.
    """
    import os
    import subprocess

    # Default to chat.xlsx in today's directory.
    if excel_file is None:
        excel_file = os.path.join(self.today_dir, 'chat.xlsx')
    if not os.path.exists(excel_file):
        print(f"错误:找不到 {excel_file}")
        return

    # Default to writing into today's directory.
    if output_file is None:
        output_file = os.path.join(self.today_dir, 'chatdata.jsonl')
    # Remove any stale output so a failed run never leaves old data behind.
    if os.path.exists(output_file):
        os.remove(output_file)

    # Heavy / third-party imports are deferred until the inputs are validated.
    from concurrent.futures import ThreadPoolExecutor

    import pandas as pd
    from tqdm import tqdm

    print(f"开始加载文件 {excel_file} ……💕")
    df = pd.read_excel(excel_file, engine='openpyxl')

    columns = ['encrypt_key', 'encrypt_chat_msg']
    # Rows with a missing field cannot be decrypted (NaN is not a valid argv
    # entry); identical encrypted pairs only need to be decrypted once.
    df = df.dropna(subset=columns).drop_duplicates(subset=columns)
    # argv entries must be strings, whatever dtype pandas inferred.
    pairs = list(zip(df['encrypt_key'].astype(str),
                     df['encrypt_chat_msg'].astype(str)))

    print("开始解密聊天记录……💕")

    def _decrypt(pair):
        """Run the SDK tool on one (key, message) pair and return its stdout."""
        key, message = pair
        completed = subprocess.run(
            [self.sdktools_path, '3', key, message],
            stdout=subprocess.PIPE,
        )
        return completed.stdout.decode('utf-8').strip()

    # Threads instead of multiprocessing.Pool: a nested function cannot be
    # pickled for worker processes, and each task only waits on a child
    # process anyway. Executor.map yields results lazily and in input order,
    # so tqdm can show real progress.
    with ThreadPoolExecutor(max_workers=os.cpu_count() or 1) as executor:
        results = list(tqdm(
            executor.map(_decrypt, pairs),
            total=len(pairs),
            desc='Processing',
        ))

    # An empty result means the tool produced no output for that record;
    # writing it would put a blank (invalid) line into the JSONL file.
    non_empty = [line for line in results if line]
    skipped = len(results) - len(non_empty)
    if skipped:
        print(f"警告:{skipped} 条记录解密结果为空,已跳过")

    # De-duplicate again (order-preserving) in case different encrypted
    # inputs decrypt to the same record.
    unique_results = list(dict.fromkeys(non_empty))

    # Write the JSONL output, one record per line.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(f"{line}\n" for line in unique_results)
    print(f"数据处理完成 ✔ 已保存到 {output_file}")