# 2025-09-25 11:42:04 +00:00
import asyncio
import aiohttp
import json
import sys
import os
sys . path . append ( os . path . dirname ( os . path . dirname ( os . path . abspath ( __file__ ) ) ) )
from config import api_info
import asyncio
import aiohttp
import json
from config import api_info
from collections import defaultdict
# ======================
# 调用大模型 API
# ======================
async def call_model_api(prompt):
    """Asynchronously call the Nuwa Chat Completions API and return the text output.

    Parameters
    ----------
    prompt : str
        The user prompt, sent as a single chat message.

    Returns
    -------
    str
        The model's reply text, or "" on any HTTP or network failure
        (errors are printed, never raised, so callers can treat the
        result as best-effort).
    """
    url = f"{api_info['base_url']}/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_info['api_key']}",
    }
    payload = {
        "model": api_info["model"],
        "messages": [{"role": "user", "content": prompt}],
        # Disable the model's "thinking" mode so only the final answer is returned.
        "thinking": {"type": "disabled"},
        "max_output_tokens": 1024,
    }
    async with aiohttp.ClientSession() as session:
        try:
            async with session.post(url, headers=headers, json=payload, timeout=60) as resp:
                if resp.status == 200:
                    result = await resp.json()
                    # OpenAI-compatible shape: choices[0].message.content
                    return result.get("choices", [{}])[0].get("message", {}).get("content", "")
                print(f"[ERROR] 请求失败: {resp.status} {await resp.text()}")
                return ""
        except Exception as e:
            # Timeouts / connection errors degrade to an empty reply.
            print(f"[ERROR] 请求异常: {e}")
            return ""
# ======================
# 异步解析每篇论文
# ======================
async def parse_paper(paper):
    """Ask the LLM to distill one paper's metadata into a structured summary.

    Parameters
    ----------
    paper : dict
        Raw paper record; reads "title" (falling back to "Conference"),
        "summary", and "keywords".

    Returns
    -------
    dict
        A copy of *paper* with an added "parsed_summary" dict containing
        the keys background / objective / method / results / contribution.
        Fields default to "" when the model output is missing or not
        valid JSON.
    """
    title = paper.get("title") or paper.get("Conference", "")
    summary = paper.get("summary", "")
    keywords = paper.get("keywords", [])
    # Full prompt specification: JSON-only output, fixed field names,
    # with a one-shot example to anchor the expected format.
    model_prompt = f"""
你是一个科研助手，请根据以下信息分析论文内容，并提炼关键信息总结成 JSON 格式。要求：
1. 输出 JSON 格式，字段包含：
   - background: 论文背景，简明说明研究动机和问题，不抄原文摘要。
   - objective: 研究目标，逻辑上支撑方法和贡献，如果有多个目标，每条编号从 1 开始，如 "1. …", "2. …"。
   - method: 研究方法，说明论文如何实现目标，逻辑上与目标和贡献连贯，如果有多条方法，每条编号从 1 开始。
   - results: 核心结论，概括论文主要结果。
   - contribution: 论文贡献总结，总结通过方法解决目标得到的价值与创新点，如果有多条贡献，每条编号从 1 开始。
2. **要求分析提炼，而非复述原文摘要**：
   - 用你自己的理解重组信息
   - 确保逻辑顺序：objective → method → contribution
   - 精炼、一针见血，但保持完整信息
3. 如果某一项无法从信息中提取，请置空 ""。
4. 输出 JSON 时严格遵循字段名称，不添加额外解释文字。
示例输入：
Title: Analyzing the Basic Elements of Mobile Viral Marketing - An Empirical Study
Summary: As personal communication tools mobile devices are platforms for word-of-mouth marketing. Given the assigned usefulness of mobile viral marketing, it is surprising to find relatively few studies directed at its basic elements, i.e., mobile viral content and consumers forwarding this content. The paper presents the findings of an online survey conducted to empirically investigate the consumers' intention to participate in different kinds of mobile viral marketing strategies and to identify the characteristics of mobile viral mavens in terms of their forwarding behaviour.
Keywords: mobile marketing, viral marketing, consumer behavior
示例输出：
{{
  "background": "移动设备为口碑传播提供了新渠道，但关于病毒营销基本元素的研究仍较少。",
  "objective": "1. 分析移动病毒营销的核心组成及消费者转发行为，理解不同策略对参与意向的影响。",
  "method": "2. 设计并实施在线问卷调查，收集消费者行为数据，并进行实证分析以验证策略效果。",
  "results": "发现消费者对不同类型的移动病毒营销策略表现出不同的参与意向。",
  "contribution": "3. 提炼移动病毒营销的关键元素及转发行为模式，为营销策略优化提供参考。"
}}
现在请根据以下信息生成 JSON：
Title: {title}
Summary: {summary}
Keywords: {', '.join(keywords)}
"""
    # One empty skeleton reused for every failure path (no duplication).
    empty = {
        "background": "",
        "objective": "",
        "method": "",
        "results": "",
        "contribution": "",
    }
    try:
        model_output = await call_model_api(model_prompt)
        # Models frequently wrap JSON in ```json ... ``` fences; strip them
        # before parsing so valid payloads are not rejected.
        cleaned = (model_output or "").strip()
        if cleaned.startswith("```"):
            cleaned = cleaned.strip("`")
            if cleaned.lower().startswith("json"):
                cleaned = cleaned[4:]
            cleaned = cleaned.strip()
        parsed = json.loads(cleaned) if cleaned else empty
    except Exception:
        # Invalid JSON or API failure: fall back to the empty skeleton.
        parsed = empty
    paper_parsed = paper.copy()
    paper_parsed["parsed_summary"] = parsed
    return paper_parsed
async def parse_ieee_results_all_categories_async(json_data):
    """Parse every category of scraped results, deduplicating by title.

    Each unique title is sent to the LLM exactly once; repeated titles
    (within or across categories) reuse the cached parse result.

    Parameters
    ----------
    json_data : dict
        Either {"results": {category: [paper, ...]}} or the inner
        {category: [paper, ...]} mapping directly.

    Returns
    -------
    dict
        {category: [parsed_paper, ...]} preserving each category's
        paper order (duplicates included, pointing at the shared
        cached parse).
    """
    # Accept both wrapper formats; equivalent to the original
    # `"results" in json_data` conditional.
    results = json_data.get("results", json_data)
    parsed_results = defaultdict(list)
    cache = {}  # title -> parsed paper, shared across all categories

    for category, papers in results.items():
        order = []          # titles in this category, in original order
        pending = {}        # title -> task, only for not-yet-parsed titles
        for paper in papers:
            title = paper.get("title") or paper.get("Conference", "")
            order.append(title)
            # NOTE: the original looked up cache[title] for any repeated
            # title immediately, which raised KeyError for duplicates
            # within the same (not yet gathered) category. Scheduling at
            # most one task per unseen title fixes that.
            if title not in cache and title not in pending:
                pending[title] = asyncio.create_task(parse_paper(paper))
        if pending:
            done = await asyncio.gather(*pending.values())
            for t, parsed_paper in zip(pending, done):
                cache[t] = parsed_paper
        # Rebuild the category list from the cache, one entry per
        # original occurrence.
        for title in order:
            parsed_results[category].append(cache[title])
    return dict(parsed_results)