60 lines
1.8 KiB
Plaintext
60 lines
1.8 KiB
Plaintext
![]() |
def decrypt_to_jsonl(self, excel_file=None, output_file=None):
    """Decrypt the chat records listed in an Excel sheet and save them as JSONL.

    The sheet must contain the columns ``encrypt_key`` and
    ``encrypt_chat_msg``. For every distinct (key, message) pair the
    executable at ``self.sdktools_path`` is invoked as
    ``<sdktools_path> 3 <encrypt_key> <encrypt_chat_msg>`` and its stripped
    standard output becomes one line of the output file.

    Args:
        excel_file: Path of the workbook to read. Defaults to ``chat.xlsx``
            inside ``self.today_dir``.
        output_file: Path of the JSONL file to write. Defaults to
            ``chatdata.jsonl`` inside ``self.today_dir``. An existing file
            at this path is deleted before processing starts.

    Returns:
        None. When ``excel_file`` does not exist, an error message is
        printed and nothing else happens.
    """
    import os

    # Default to chat.xlsx in today's directory.
    if excel_file is None:
        excel_file = os.path.join(self.today_dir, 'chat.xlsx')

    if not os.path.exists(excel_file):
        print(f"错误:找不到 {excel_file}")
        return

    # Deferred until we know there is work to do, so the missing-file path
    # stays cheap and does not depend on the heavier packages.
    import subprocess
    from multiprocessing.pool import ThreadPool

    import pandas as pd
    from tqdm import tqdm

    # Default to writing into today's directory.
    if output_file is None:
        output_file = os.path.join(self.today_dir, 'chatdata.jsonl')

    # Remove any previous output first so a failed run cannot leave stale
    # data behind.
    if os.path.exists(output_file):
        os.remove(output_file)

    print(f"开始加载文件 {excel_file} ……💕")
    df = pd.read_excel(excel_file, engine='openpyxl')

    columns = ['encrypt_key', 'encrypt_chat_msg']
    # Empty cells are read as NaN (a float), which subprocess.run rejects as
    # an argument; rows missing either field cannot be decrypted anyway.
    df = df.dropna(subset=columns)
    # De-duplicate the encrypted fields up front to avoid decrypting the
    # same record more than once.
    df = df.drop_duplicates(subset=columns)

    # str() guards against cells that Excel stored as numbers.
    pairs = [
        (str(key), str(msg))
        for key, msg in zip(df['encrypt_key'], df['encrypt_chat_msg'])
    ]

    print("开始解密聊天记录……💕")

    def _decrypt(pair):
        """Run the SDK tool on one (key, message) pair and return its output."""
        key, msg = pair
        completed = subprocess.run(
            [self.sdktools_path, '3', key, msg],
            stdout=subprocess.PIPE,
        )
        return completed.stdout.decode('utf-8').strip()

    # Each task only waits on an external process, so threads provide the
    # parallelism. A process pool would have to pickle the worker, and a
    # nested function (closing over ``self``) cannot be pickled.
    # ``imap`` yields results in input order as they finish, which lets tqdm
    # display real progress; ``map`` would block until everything is done.
    with ThreadPool() as pool:
        results = list(tqdm(
            pool.imap(_decrypt, pairs),
            total=len(pairs),
            desc='Processing',
        ))

    # De-duplicate again (order-preserving): different encrypted inputs may
    # still decrypt to identical output.
    unique_results = list(dict.fromkeys(results))

    # Write one decrypted record per line.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(f"{line}\n" for line in unique_results)

    print(f"数据处理完成 ✔ 已保存到 {output_file}")