Merge the submodule into a regular folder and re-add all files
This commit is contained in:
parent 993763aceb
commit 80b80940c6

selenium
@@ -1 +0,0 @@
Subproject commit 26767ab8cc5227d81a027c9fb156459936699514
@@ -0,0 +1,68 @@
# --------------------------
# Base image
# --------------------------
FROM ubuntu:20.04
ENV DEBIAN_FRONTEND=noninteractive
WORKDIR /app

# --------------------------
# 1. Install system dependencies (online)
# --------------------------
RUN apt-get update && apt-get install -y \
    xvfb \
    gdebi-core \
    python3 python3-pip \
    curl wget unzip \
    fonts-liberation \
    libappindicator3-1 \
    libasound2 \
    libatk1.0-0 \
    libatk-bridge2.0-0 \
    libcups2 \
    libdbus-1-3 \
    libgdk-pixbuf2.0-0 \
    libnspr4 \
    libnss3 \
    libx11-xcb1 \
    libxcomposite1 \
    libxdamage1 \
    libxrandr2 \
    libgbm1 \
    libpango-1.0-0 \
    libpangocairo-1.0-0 \
    ca-certificates \
    libvulkan1 \
    xdg-utils \
    && rm -rf /var/lib/apt/lists/*

# --------------------------
# 2. Copy Chrome and Chromedriver (offline)
# --------------------------
COPY chrome/google-chrome-stable_140.0.7339.185-1_amd64.deb /tmp/chrome/
COPY chrome/chromedriver /usr/local/bin/chromedriver
RUN chmod +x /usr/local/bin/chromedriver

# Install Chrome
RUN gdebi -n /tmp/chrome/google-chrome-stable_140.0.7339.185-1_amd64.deb \
    && rm -rf /tmp/chrome

# --------------------------
# 3. Install Python dependencies (online)
# --------------------------
COPY requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt

# --------------------------
# 4. Copy project code
# --------------------------
COPY . .

# --------------------------
# 5. Flask port & entrypoint
# --------------------------
EXPOSE 5000
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh
CMD ["/entrypoint.sh"]
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,27 @@
// Copyright 2015 The Chromium Authors
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//    * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//    * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//    * Neither the name of Google LLC nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
File diff suppressed because it is too large
Binary file not shown.
@@ -0,0 +1,50 @@
import time
import random
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import os

api_info = {
    "model": "gpt-4.1-2025-04-14",
    "base_url": "https://api.nuwaapi.com/v1",
    "api_key": "sk-gZsDzmPpOh1UpVzLzkh9dP05v0nLv9iR0HCazhlO7ZNZ3Ier"
}

# Chrome browser and driver configuration
CHROME_BINARY_PATH = os.environ.get("CHROME_BINARY_PATH", "/usr/bin/google-chrome")
CHROMEDRIVER_PATH = os.environ.get("CHROMEDRIVER_PATH", "/usr/local/bin/chromedriver")
# Maximum number of concurrent browsers
MAX_CONCURRENT_BROWSERS = 3


# Create a browser driver
def create_browser():
    options = webdriver.ChromeOptions()
    options.binary_location = CHROME_BINARY_PATH
    # Use a random remote-debugging port for each instance
    options.add_argument(f"--remote-debugging-port={random.randint(9222, 9322)}")
    # options.add_argument("--headless=new")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-software-rasterizer")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-setuid-sandbox")
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    prefs = {
        "download.prompt_for_download": False,
        "plugins.always_open_pdf_externally": True,
        "profile.default_content_setting_values.automatic_downloads": 1,
        "safebrowsing.enabled": True,
        "safebrowsing.disable_download_protection": True
    }
    options.add_experimental_option("prefs", prefs)
    return webdriver.Chrome(service=Service(CHROMEDRIVER_PATH), options=options)


def _scroll_into_view(driver, el):
    try:
        driver.execute_script("arguments[0].scrollIntoView({block:'center', inline:'center'});", el)
        time.sleep(0.2)
    except Exception:
        pass
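
A minimal smoke-test sketch for the two helpers above (the target URL is illustrative, not part of the project):

# smoke_test.py -- run next to config.py
from selenium.webdriver.common.by import By
from config import create_browser, _scroll_into_view

driver = create_browser()
try:
    driver.get("https://example.org")   # any reachable page works for a smoke test
    heading = driver.find_element(By.TAG_NAME, "h1")
    _scroll_into_view(driver, heading)  # centers the element in the viewport
    print(heading.text)
finally:
    driver.quit()                       # always release the Chrome process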
@@ -0,0 +1,6 @@
#!/bin/bash
set -e
echo "[INFO] Starting main.py under xvfb-run..."
xvfb-run python3 main.py 2>&1 | tee /var/log/main.log || true
echo "[INFO] main.py finished. Container will keep running for debugging..."
tail -f /dev/null
@@ -0,0 +1,228 @@
# app.py
print("[DEBUG] main.py started")
import sys
sys.stdout.flush()
import json
import threading
import requests
import asyncio
import math
from flask import Flask, request, jsonify
from utils.springerLink import springerLink  # crawler interfaces
from utils.arxiv import arxiv
from utils.pubmed import pubmed
from utils.wangfang import wangfang
from utils.zhiwang import zhiwang
from utils.weipu import weipu
from utils.ieeeXplore import ieeeXplore
from parseApi.api import parse_ieee_results_all_categories_async
from flask_cors import CORS
from config import MAX_CONCURRENT_BROWSERS, api_info

app = Flask(__name__)
# Allow all cross-origin requests
CORS(app, resources={r"/*": {"origins": "*"}}, supports_credentials=True, allow_headers="*")
semaphore = threading.Semaphore(MAX_CONCURRENT_BROWSERS)
# Site functions are split into Chinese-language and English-language sites
CHINESE_SITE_FUNCTIONS = [zhiwang, wangfang, weipu]
ENGLISH_SITE_FUNCTIONS = [ieeeXplore, arxiv, pubmed]


def translate_text(text):
    """
    Input:
        text: a sentence or a list of Chinese keywords (str);
        api_info (from config) supplies base_url, api_key and model.
    Output:
        dict: {"chinese": [...], "english": [...]}
    """
    if not text:
        return {"chinese": [], "english": []}

    # Build the prompt (kept in Chinese: the model extracts Chinese topics)
    prompt = (
        "你是科研助手,输入是一句话或中文关键词列表。"
        "请从输入中理解语义,提取与科研论文主题最相关、最核心的中文主题,并翻译为英文。"
        "只保留1~2个最核心主题,不要加入无关内容。"
        "输出必须严格遵守 JSON 格式,不允许有额外文字或符号:{\"chinese\": [...], \"english\": [...]}。\n"
        "示例输入输出:\n"
        "输入: '我想获取基于深度学习的图像识别方面的研究'\n"
        "输出: {\"chinese\": [\"基于深度学习的图像识别\"], \"english\": [\"Deep Learning-based Image Recognition\"]}\n"
        "输入: '图像识别在深度学习方面的研究'\n"
        "输出: {\"chinese\": [\"基于深度学习的图像识别\"], \"english\": [\"Deep Learning-based Image Recognition\"]}\n"
        "输入: '自然语言处理模型在文本分类中的应用'\n"
        "输出: {\"chinese\": [\"自然语言处理文本分类\"], \"english\": [\"NLP Text Classification\"]}\n"
        "输入: '强化学习在自动驾驶决策中的最新进展'\n"
        "输出: {\"chinese\": [\"强化学习自动驾驶决策\"], \"english\": [\"Reinforcement Learning for Autonomous Driving Decision-Making\"]}\n"
        "输入: '使用图神经网络进行社交网络分析的研究'\n"
        "输出: {\"chinese\": [\"图神经网络社交网络分析\"], \"english\": [\"Graph Neural Networks for Social Network Analysis\"]}\n"
        "输入: '我想研究深度强化学习在机器人控制中的应用'\n"
        "输出: {\"chinese\": [\"深度强化学习机器人控制\"], \"english\": [\"Deep Reinforcement Learning for Robot Control\"]}\n"
        f"现在请对输入提取核心主题:\n输入: {text}"
    )

    url = f"{api_info['base_url']}/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_info['api_key']}"
    }
    payload = {
        "model": api_info["model"],
        "messages": [{"role": "user", "content": prompt}],
        "max_output_tokens": 512
    }

    try:
        resp = requests.post(url, headers=headers, json=payload, timeout=30)
        resp.raise_for_status()
        result = resp.json()

        text_output = result.get("choices", [{}])[0].get("message", {}).get("content", "")
        if not text_output:
            return {"chinese": [text], "english": []}

        try:
            parsed = json.loads(text_output)
            chinese = parsed.get("chinese", [text])
            english = parsed.get("english", [])
            return {"chinese": chinese, "english": english}
        except json.JSONDecodeError:
            return {"chinese": [text], "english": []}

    except requests.RequestException as e:
        print(f"[ERROR] request failed: {e}")
        return {"chinese": [text], "english": []}


async def crawl_single(keyword, site_func, limit, sort):
    loop = asyncio.get_event_loop()
    try:
        print(f"[DEBUG] Opening browser for {site_func.__name__} with keyword '{keyword}'")
        result = await loop.run_in_executor(
            None,
            lambda: site_func(keyword, limit, sort_options=sort)
        )
        print(f"[DEBUG] Finished crawling {site_func.__name__} with keyword '{keyword}'")
        return result
    except Exception as e:
        print(f"[ERROR] {site_func.__name__} with keyword '{keyword}' failed: {e}")
        return []


async def crawl_and_parse(kw, site_func, limit, sort, parse_flag):
    try:
        results = await crawl_single(kw, site_func, limit, sort)
        if parse_flag and results:
            print("Data before parsing:", results)
            parsed_results = await parse_ieee_results_all_categories_async(results)
            print(f"[DEBUG] Parsed results: {parsed_results}")
            return parsed_results or []
        return results or []
    except Exception as e:
        print(f"[ERROR] {site_func.__name__} with keyword '{kw}' failed: {e}")
        return []


# crawl_all_keywords needs little change: a semaphore caps the concurrency
async def crawl_all_keywords(chinese_keywords, english_keywords, limit, sort, max_concurrent=MAX_CONCURRENT_BROWSERS, parse_flag=True):
    all_tasks = []

    # Chinese sites
    for kw in chinese_keywords:
        for func in CHINESE_SITE_FUNCTIONS:
            all_tasks.append((kw, func))
    # English sites
    for kw in english_keywords:
        for func in ENGLISH_SITE_FUNCTIONS:
            all_tasks.append((kw, func))

    semaphore = asyncio.Semaphore(max_concurrent)

    async def sem_task(kw, func):
        async with semaphore:
            return await crawl_and_parse(kw, func, limit, sort, parse_flag)

    tasks = [sem_task(kw, func) for kw, func in all_tasks]
    all_results = await asyncio.gather(*tasks, return_exceptions=True)

    final_results = []
    weipu_empty = []  # keywords whose weipu results came back empty

    # Process the first round of results
    for (kw, func), r in zip(all_tasks, all_results):
        if isinstance(r, dict):
            for category, papers in r.items():
                final_results.extend(papers)
        elif isinstance(r, list):
            final_results.extend(r)
        # If weipu returned an empty list, remember the keyword
        if func is weipu and not r:
            weipu_empty.append(kw)

    # ---- Added logic: retry keywords whose weipu results were empty ----
    for kw in weipu_empty:
        try:
            print(f"[INFO] Weipu empty for '{kw}', retrying...")
            retry_res = await crawl_and_parse(kw, weipu, limit, sort, parse_flag)
            if isinstance(retry_res, dict):
                for category, papers in retry_res.items():
                    final_results.extend(papers)
            elif isinstance(retry_res, list):
                final_results.extend(retry_res)
        except Exception as e:
            print(f"[ERROR] Weipu retry failed for '{kw}': {e}")
    # ---------------------------------------------------------

    return final_results


@app.route("/crawl", methods=["POST", "OPTIONS"])
def crawl():
    if request.method == "OPTIONS":
        return jsonify({"status": "ok"}), 200
    data = request.json
    if not data or "texts" not in data:
        return jsonify({"success": False, "error": "Missing 'texts' field"}), 400

    text_input = data["texts"]
    parse_flag = data.get("parse", True)
    print("Natural-language input:", text_input)
    sort = data.get("sort", ["relevance"])
    max_concurrent = int(data.get("max_concurrent", 3))

    max_retries = 3
    translated = translate_text(text_input)
    chinese_keywords = translated.get("chinese", [])
    english_keywords = translated.get("english", [])

    retry_count = 0
    while not english_keywords and retry_count < max_retries:
        retry_count += 1
        retry_translated = translate_text(text_input)
        # Keep the first (or latest) Chinese keywords
        chinese_keywords = retry_translated.get("chinese", chinese_keywords)
        english_keywords = retry_translated.get("english", [])
        if english_keywords:
            break  # got English keywords, stop retrying

    print(translated)

    raw_limit = data.get("limit")
    if raw_limit is not None:
        raw_limit = int(raw_limit)
        total_tasks = max(1, len(chinese_keywords) * len(CHINESE_SITE_FUNCTIONS)
                          + len(english_keywords) * len(ENGLISH_SITE_FUNCTIONS))
        limit = max(1, math.ceil(raw_limit / total_tasks))  # at least 1 result per site
    else:
        limit = 10

    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    async def main():
        results = await crawl_all_keywords(chinese_keywords, english_keywords, limit, sort, max_concurrent, parse_flag)
        return results

    try:
        final_results = loop.run_until_complete(main())
        return jsonify({"success": True, "results": final_results})
    except Exception as e:
        return jsonify({"success": False, "error": str(e)}), 500
    finally:
        loop.close()  # avoid leaking one event loop per request


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5000, debug=False, use_reloader=False)
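
For reference, a minimal sketch of exercising translate_text from main.py directly (importing main also pulls in the crawler modules; a valid api_info key in config.py is assumed, and the sample sentence is illustrative):

from main import translate_text

keywords = translate_text("强化学习在自动驾驶决策中的最新进展")
# Expected shape: {"chinese": ["..."], "english": ["..."]}
print(keywords["chinese"], keywords["english"])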
@@ -0,0 +1,148 @@
import asyncio
import aiohttp
import json
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from config import api_info
from collections import defaultdict


# ======================
# Call the LLM API
# ======================
async def call_model_api(prompt):
    """
    Asynchronously call the Nuwa Chat Completions API and return the text output.
    """
    url = f"{api_info['base_url']}/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_info['api_key']}"
    }
    payload = {
        "model": api_info["model"],
        "messages": [{"role": "user", "content": prompt}],
        "max_output_tokens": 1024
    }

    async with aiohttp.ClientSession() as session:
        try:
            async with session.post(url, headers=headers, json=payload, timeout=60) as resp:
                if resp.status == 200:
                    result = await resp.json()
                    # Extract the model output
                    text = result.get("choices", [{}])[0].get("message", {}).get("content", "")
                    return text
                else:
                    print(f"[ERROR] request failed: {resp.status} {await resp.text()}")
                    return ""
        except Exception as e:
            print(f"[ERROR] request error: {e}")
            return ""


# ======================
# Parse each paper asynchronously
# ======================
async def parse_paper(paper):
    title = paper.get("title") or paper.get("Conference", "")
    summary = paper.get("summary", "")
    keywords = paper.get("keywords", [])

    # Full prompt specification (kept in Chinese: the model answers in Chinese)
    model_prompt = f"""
你是一个科研助手,请根据以下信息分析论文内容,并提炼关键信息总结成 JSON 格式。要求:
1. 输出 JSON 格式,字段包含:
   - background: 论文背景,简明说明研究动机和问题,不抄原文摘要。
   - objective: 研究目标,逻辑上支撑方法和贡献,如果有多个目标,每条编号从 1 开始,如 "1. …", "2. …"。
   - method: 研究方法,说明论文如何实现目标,逻辑上与目标和贡献连贯,如果有多条方法,每条编号从 1 开始。
   - results: 核心结论,概括论文主要结果。
   - contribution: 论文贡献总结,总结通过方法解决目标得到的价值与创新点,如果有多条贡献,每条编号从 1 开始。
2. **要求分析提炼,而非复述原文摘要**:
   - 用你自己的理解重组信息
   - 确保逻辑顺序:objective → method → contribution
   - 精炼、一针见血,但保持完整信息

3. 如果某一项无法从信息中提取,请置空 ""。
4. 输出 JSON 时严格遵循字段名称,不添加额外解释文字。

示例输入:
Title: Analyzing the Basic Elements of Mobile Viral Marketing-An Empirical Study
Summary: As personal communication tools mobile devices are platforms for word-of-mouth marketing. Given the assigned usefulness of mobile viral marketing, it is surprising to find relatively few studies directed at its basic elements, i.e., mobile viral content and consumers forwarding this content. The paper presents the findings of an online survey conducted to empirically investigate the consumers' intention to participate in different kinds of mobile viral marketing strategies and to identify the characteristics of mobile viral mavens in terms of their forwarding behaviour.
Keywords: mobile marketing, viral marketing, consumer behavior

示例输出:
{{
    "background": "移动设备为口碑传播提供了新渠道,但关于病毒营销基本元素的研究仍较少。",
    "objective": "1. 分析移动病毒营销的核心组成及消费者转发行为,理解不同策略对参与意向的影响。",
    "method": "2. 设计并实施在线问卷调查,收集消费者行为数据,并进行实证分析以验证策略效果。",
    "results": "发现消费者对不同类型的移动病毒营销策略表现出不同的参与意向。",
    "contribution": "3. 提炼移动病毒营销的关键元素及转发行为模式,为营销策略优化提供参考。"
}}

现在请根据以下信息生成 JSON:
Title: {title}
Summary: {summary}
Keywords: {', '.join(keywords)}
"""

    try:
        model_output = await call_model_api(model_prompt)
        parsed = json.loads(model_output) if model_output else {
            "background": "",
            "objective": "",
            "method": "",
            "results": "",
            "contribution": ""
        }
    except Exception:
        parsed = {
            "background": "",
            "objective": "",
            "method": "",
            "results": "",
            "contribution": ""
        }

    paper_parsed = paper.copy()
    paper_parsed["parsed_summary"] = parsed
    return paper_parsed


async def parse_ieee_results_all_categories_async(json_data):
    """
    Parse every category in the crawl results, de-duplicating articles
    (papers sharing a title are parsed only once).
    A cache avoids parsing the same article twice.
    """
    # Accept both input shapes
    results = json_data.get("results", {}) if "results" in json_data else json_data

    parsed_results = defaultdict(list)  # final result: category -> list of papers
    seen_titles = set()  # global de-duplication, prevents double parsing
    cache = {}  # parsed articles: title -> parsed data

    for category, papers in results.items():
        tasks = []
        for paper in papers:
            title = paper.get("title") or paper.get("Conference", "")
            if title in cache:
                # Already parsed in an earlier batch: reuse the cached result
                tasks.append(asyncio.sleep(0, result=cache[title]))
            elif title in seen_titles:
                # Duplicate within the current batch: the first occurrence is still
                # being parsed, so skip it (indexing the cache here would KeyError)
                continue
            else:
                # New article: schedule it and record the title
                seen_titles.add(title)
                task = asyncio.create_task(parse_paper(paper))
                tasks.append(task)

        if tasks:
            parsed_papers = await asyncio.gather(*tasks)
            # Save to the cache and append to the matching category
            for parsed_paper in parsed_papers:
                t = parsed_paper.get("title") or parsed_paper.get("Conference", "")
                cache[t] = parsed_paper
                parsed_results[category].append(parsed_paper)

    return dict(parsed_results)
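
A minimal sketch of driving the parser above (the sample paper is made up for illustration, and a valid api_info key is assumed; on API failure each field simply comes back empty):

import asyncio
from parseApi.api import parse_ieee_results_all_categories_async

sample = {"results": {"relevance": [{
    "title": "A Toy Paper",
    "summary": "One-sentence abstract for demonstration.",
    "keywords": ["demo"],
}]}}

parsed = asyncio.run(parse_ieee_results_all_categories_async(sample))
# Each paper comes back with an added "parsed_summary" dict
print(parsed["relevance"][0]["parsed_summary"])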
@@ -0,0 +1,34 @@
1. Install the browser (commands below)

wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
sudo apt install ./google-chrome-stable_current_amd64.deb -y

2. Once it is installed, check the version:

google-chrome --version
# Example output: Google Chrome 140.0.7339.80

3. Download the matching ChromeDriver:
Visit https://googlechromelabs.github.io/chrome-for-testing/
and pick the version that matches the google-chrome output from the previous step. Example:
wget https://storage.googleapis.com/chrome-for-testing-public/140.0.7339.80/linux64/chromedriver-linux64.zip
unzip chromedriver-linux64.zip
cd chromedriver-linux64
sudo mv chromedriver /usr/local/bin/
sudo chmod +x /usr/local/bin/chromedriver
Then verify:
chromedriver --version
# Example output: ChromeDriver 140.0.7339.80 (670b6f192f4668d2ac2c06bd77ec3e4eeda7d648-refs/branch-heads/7339_41@{#3})

4. Install xvfb-run; example run:
xvfb-run python main.py


Step 1: build the image
docker build -t selenium:latest .
Step 2: start the container:
docker run -d \
  --name selenium-container \
  -p 5001:5000 \
  selenium
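
Once the container is up, the service can be smoke-tested against the /crawl endpoint defined in main.py (a sketch; the query text is illustrative):

import requests

resp = requests.post(
    "http://localhost:5001/crawl",  # host port 5001 maps to Flask's 5000
    json={"texts": "基于深度学习的图像识别", "limit": 5, "parse": False},
    timeout=600,                    # crawling several sites is slow
)
print(resp.json())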
@@ -0,0 +1,7 @@
Flask
flask-cors
aiohttp
requests
beautifulsoup4
selenium
lxml
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,244 @@
# coding=utf-8
import csv
import json
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import random
import time
import traceback
from bs4 import BeautifulSoup
from config import create_browser, _scroll_into_view
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException,
    NoSuchElementException,
)


def extract_row_info(row, driver):
    """Scrape one result row and return it as a dict."""
    _scroll_into_view(driver, row)
    # Original link
    try:
        originalElem = row.find_element(By.CSS_SELECTOR, "div > p > a")
        originalLink = originalElem.get_attribute("href")
    except Exception:
        originalLink = ""
    # Title
    try:
        title = row.find_element(By.CSS_SELECTOR, "p.title.is-5.mathjax").text.strip()
    except Exception:
        title = ""
    # Authors
    try:
        authors = [a.text.strip() for a in
                   row.find_element(By.CSS_SELECTOR, "p.authors").find_elements(By.TAG_NAME, "a")]
    except Exception:
        authors = []

    # Submission date
    try:
        info_p = row.find_element(By.CSS_SELECTOR, "p.is-size-7").text
        date = ""
        for part in info_p.split(";"):
            if "Submitted" in part:
                date = part.replace("Submitted", "").strip()
                break
    except Exception:
        date = ""
    print("Original URL:", originalLink)
    print("Title:", title)
    print("Authors:", authors)
    print("Submitted:", date)
    time.sleep(1)
    try:
        # Click through to the detail page
        originalElem.click()
        time.sleep(2)
        # PDF link
        try:
            pdf_link_elem = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "a.abs-button.download-pdf"))
            )
            pdf_link = pdf_link_elem.get_attribute("href")
        except Exception:
            pdf_link = ""
        print("PDF link:", pdf_link)
        # Grab the abstract's innerHTML
        abstract_elem = driver.find_element(By.CSS_SELECTOR, "blockquote.abstract.mathjax")
        html_text = abstract_elem.get_attribute("innerHTML").replace("<br>", "\n").strip()

        # Strip all tags with BeautifulSoup
        soup = BeautifulSoup(html_text, "html.parser")
        summary_text = soup.get_text().strip()

    except Exception as e:
        pdf_link = ""
        summary_text = ""
        print("[ERROR] failed to fetch abstract:", e)

    finally:
        # Navigate back to the result list
        try:
            driver.back()
            time.sleep(1)
        except Exception as e:
            print("[WARN] failed to navigate back:", e)
    print("Abstract:", summary_text)
    time.sleep(1)

    return {
        "title": title,
        "author": authors,
        "site": "arxiv",
        "originalLink": originalLink,
        "pdfUrl": pdf_link,
        "date": date,
        "summary": summary_text
    }


def crawl_current_sort(driver, limit):
    """Scrape up to limit records under the current sort order."""
    fetched_count = 0
    results = []

    while fetched_count < limit:
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#main-container > div.content > ol > li:nth-child(1) > div > p > a'))
            )
        except TimeoutException:
            print("[WARN] result list did not appear on this page, continuing")
        time.sleep(2)

        rows = driver.find_elements(By.CSS_SELECTOR, '#main-container > div.content > ol > li')
        for idx, row in enumerate(rows, 1):
            if fetched_count >= limit:
                break
            try:
                info = extract_row_info(row, driver)
                if isinstance(info, dict):
                    results.append(info)
                    fetched_count += 1
                time.sleep(random.uniform(0.5, 1.2))
            except Exception as e:
                print(f"[ERROR] {e}")
                traceback.print_exc()
                try:
                    if driver.window_handles:
                        driver.switch_to.window(driver.window_handles[0])
                except Exception:
                    pass
        if fetched_count >= limit:
            break
        # Pagination
        try:
            # Locate the single "next page" button (covers different pagination layouts)
            next_btn = driver.find_element(
                By.CSS_SELECTOR,
                "#main-container > div.content > nav:nth-child(3) > a.pagination-next"  # stray spaces removed
            )
            # Stop if the button is disabled
            if not next_btn.is_enabled():
                break
            _scroll_into_view(driver, next_btn)
            try:
                next_btn.click()
            except Exception:
                try:
                    ActionChains(driver).move_to_element(next_btn).pause(0.1).click(next_btn).perform()
                except Exception:
                    driver.execute_script("arguments[0].click();", next_btn)
            print("Moving to next page")
            time.sleep(random.uniform(1, 1.5))
        except Exception:
            print("[INFO] reached the last page or pagination failed")
            break
    return results


def arxiv(keyword, limit, sort_options=None):
    driver = create_browser()
    wait = WebDriverWait(driver, 15)
    all_results = {}
    if not sort_options:
        sort_options = ["publication_time"]  # default: sort by date
    try:
        driver.get("https://arxiv.org/")
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#header > div.search-block.level-right > form > div > div:nth-child(1) > input"))).send_keys(keyword)
        driver.find_element(By.CSS_SELECTOR, "#header > div.search-block.level-right > form > div > button").click()
        time.sleep(5)

        for sort_name in sort_options:
            if sort_name == "publication_time":
                print("[INFO] sorting by date (default)")
                # arXiv defaults to date order, no extra clicks needed
                pass

            elif sort_name == "relevance":
                print("[INFO] switching to relevance order")
                try:
                    # Open the sort dropdown
                    order_select_elem = WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.ID, "order"))
                    )
                    order_select = Select(order_select_elem)
                    time.sleep(1)
                    target_text = "Relevance"
                    for option in order_select.options:
                        if option.text.strip().lower() == target_text.lower():
                            order_select.select_by_value(option.get_attribute("value"))
                            print(f"Sort selected: {option.text} -> {option.get_attribute('value')}")
                            break
                    time.sleep(2)
                except Exception as e:
                    print(f"[WARN] failed to switch to relevance order: {e}")

            # Scrape results under the current sort order
            results = crawl_current_sort(driver, limit)
            all_results[sort_name] = results

    finally:
        try:
            driver.quit()
        except Exception:
            pass
        print("[DONE] arxiv crawl finished")

    print(json.dumps(all_results, ensure_ascii=False, indent=2))
    return all_results


if __name__ == '__main__':
    keyword = "graphrag"
    limit = 100
    arxiv(keyword, limit, ["relevance"])
@@ -0,0 +1,266 @@
# coding=utf-8
import csv
import json
import re
import time
import random
import traceback
import sys
import os
import requests
from bs4 import BeautifulSoup
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    ElementClickInterceptedException
)
from selenium.webdriver.common.action_chains import ActionChains
from config import create_browser, _scroll_into_view


def get_abstract_in_new_tab(url, headers=None, timeout=100):
    """
    Fetch the abstract from a paper's detail-page URL (no Selenium needed).
    """
    if headers is None:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36"
        }

    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
        resp.raise_for_status()
    except requests.RequestException as e:
        print(f"[ERROR] failed to fetch page: {e}")
        return ""

    # Parse the page with BeautifulSoup
    soup = BeautifulSoup(resp.text, "lxml")

    # Extract the abstract from the JavaScript metadata blob
    match = re.search(r"xplGlobal\.document\.metadata\s*=\s*(\{.*?\});", resp.text, re.S)
    if match:
        metadata_json = match.group(1)
        try:
            metadata = json.loads(metadata_json)
            abstract = metadata.get("abstract", "")
        except json.JSONDecodeError:
            abstract = ""
    else:
        abstract = ""

    return abstract


# --------- main functions ---------
def extract_row_info(row, driver):
    """Scrape one result row and return it as a dict."""
    try:
        urlIndex = row.find_element(By.CLASS_NAME, "fw-bold")
        relative_link = urlIndex.get_attribute("href")
        title = urlIndex.text.strip()

        base_url = "https://ieeexplore.ieee.org"
        if relative_link.startswith("/"):
            originalLink = base_url + relative_link
        else:
            originalLink = relative_link
    except Exception as e:
        print(f"[WARN] failed to get paper title or link: {e}")
        title = ""
        originalLink = ""
    print("Title:", title)
    print("Link:", originalLink)

    try:
        authors = [a.text for a in row.find_elements(By.CSS_SELECTOR, 'xpl-authors-name-list a span')]
        authors = [a for a in authors if a.strip()]
    except Exception as e:
        print(f"[WARN] failed to get author list: {e}")
        authors = []
    print("Authors:", authors)

    try:
        Conference = row.find_element(By.CSS_SELECTOR, "a[xplhighlight]").text.strip()
        print("Conference:", Conference)
    except Exception:
        Conference = ""  # was left unbound on failure, breaking the return dict below
        print("No conference info found")

    try:
        info_text = row.find_element(By.CSS_SELECTOR, "div.publisher-info-container").text
        # info_text looks like "Year: 2025 | Conference Paper | Publisher: IEEE"
        parts = [p.strip() for p in info_text.split('|')]
        date = parts[0].replace("Year:", "").strip()
        paper_type = parts[1] if len(parts) > 1 else ""
        print(f"Year: {date}, type: {paper_type}")
    except Exception:
        date = ""
        paper_type = ""
        print("No year or type found")

    time.sleep(1)

    abstract = get_abstract_in_new_tab(originalLink)
    print("Abstract:", abstract)
    time.sleep(2)  # wait for the page to settle
    return {
        "title": title,
        "originalLink": originalLink,
        "author": authors,
        "type": paper_type,
        "Conference": Conference,
        "date": date,
        "site": "ieeeXplore",
        "summary": abstract
    }


def crawl_current_sort(driver, limit):
    """Scrape up to limit records under the current sort order."""
    fetched_count = 0
    results = []

    while fetched_count < limit:
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#LayoutWrapper > div > div > div.stats-search-page.xpl-serp.ng2-app > div > xpl-root > header > xpl-header > div > xpl-navbar > div > div.top-navbar > div.left-side-container > div > div.xplore-logo-wrapper > xpl-xplore-logo > div > a > img'))
            )
        except TimeoutException:
            print("[WARN] result list did not appear on this page, continuing")
        time.sleep(2)

        rows = driver.find_elements(By.CLASS_NAME, 'List-results-items')
        print(f'Result rows on this page: {len(rows)}')
        for i in range(len(rows)):
            print(f'Row {i+1}')
            row = driver.find_elements(By.CLASS_NAME, 'List-results-items')[i]  # re-query to avoid stale elements
            if fetched_count >= limit:
                break
            try:
                info = extract_row_info(row, driver)
                if info:  # only append valid rows
                    results.append(info)
                    fetched_count += 1
                time.sleep(2)
            except Exception as e:
                print(f"[ERROR] failed to scrape row: {e}")
                traceback.print_exc()
                # Force a switch back to the main window to avoid deadlocks
                try:
                    if driver.window_handles:
                        driver.switch_to.window(driver.window_handles[0])
                except Exception:
                    pass
                continue  # keep going with the next row

        # Pagination
        try:
            next_btn = driver.find_element(By.CSS_SELECTOR, "#xplMainContent > div.ng-SearchResults.row.g-0 > div.col > xpl-paginator > div.pagination-bar.hide-mobile.text-base-md-lh > ul > li.next-btn > button")
            if next_btn.is_enabled() and fetched_count < limit:
                _scroll_into_view(driver, next_btn)
                try:
                    next_btn.click()
                except Exception:
                    try:
                        ActionChains(driver).move_to_element(next_btn).pause(1).click(next_btn).perform()
                    except Exception:
                        driver.execute_script("arguments[0].click();", next_btn)
                print("Moving to next page")
                time.sleep(random.uniform(1, 2))
            else:
                break
        except NoSuchElementException:
            print("[INFO] reached the last page or no pagination")
            break
        except Exception as e:
            print(f"[ERROR] pagination failed: {e}")
            break

    return results


def ieeeXplore(keyword, limit, sort_options=None):
    """Main entry: scrape under the requested sort orders."""
    driver = create_browser()
    wait = WebDriverWait(driver, 15)
    all_results = {}
    if not sort_options:
        sort_options = ["relevance"]  # default: relevance

    try:
        driver.get("https://ieeexplore.ieee.org/Xplore/home.jsp")

        try:
            accept_btn = driver.find_element(By.CSS_SELECTOR, "button.osano-cm-accept-all")
            accept_btn.click()
            print("Cookie banner found; clicked the Accept All button")
            # Wait for the banner to disappear
            WebDriverWait(driver, 10).until(
                EC.invisibility_of_element(accept_btn)
            )
        except NoSuchElementException:
            # No banner, carry on
            print("No Accept All button detected")

        input_box = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, "input.Typeahead-input")
        ))
        input_box.clear()
        input_box.send_keys(keyword)
        # Locate the search button
        search_btn = driver.find_element(By.CSS_SELECTOR, "button.fa.fa-search.stats-Global_Search_Icon")
        search_btn.click()
        time.sleep(4)
        for sort_name in sort_options:
            try:
                if sort_name == "publication_time":
                    print("[INFO] sorting by date")
                    # Locate the sort dropdown
                    dropdown_btn = wait.until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, "button.dropdown-toggle.xpl-btn-secondary"))
                    )
                    # Scroll the dropdown into view
                    driver.execute_script("arguments[0].scrollIntoView(true);", dropdown_btn)
                    # Click via JS (avoids ElementNotInteractable)
                    driver.execute_script("arguments[0].click();", dropdown_btn)
                    # Give the options ~1s to render
                    time.sleep(1)
                    # Locate the "Newest" option
                    newest_option = wait.until(
                        EC.presence_of_element_located(
                            (By.XPATH,
                             "//button[contains(@class,'dropdown-item') and contains(normalize-space(.),'Newest')]")
                        )
                    )
                    # Click the option via JS
                    driver.execute_script("arguments[0].click();", newest_option)
                    time.sleep(2)
                elif sort_name == "relevance":
                    print("[INFO] sorting by relevance (default)")

            except Exception as e:
                print(f"[WARN] failed to apply sort {sort_name}:", e)

            # Scrape results under the current sort order
            results = crawl_current_sort(driver, limit)
            all_results[sort_name] = results

    finally:
        try:
            driver.quit()
        except Exception:
            pass
        print("[DONE] ieeeXplore crawl finished")
    print(json.dumps(all_results, ensure_ascii=False, indent=2))

    return all_results


if __name__ == '__main__':
    keyword = "bert"
    limit = 100
    # ieeeXplore(keyword, limit, ["relevance"])
    # newest only
    # ieeeXplore(keyword, limit, ["publication_time"])
    # relevance first, then newest
    ieeeXplore(keyword, limit, ["relevance", "publication_time"])
@@ -0,0 +1,253 @@
# coding=utf-8
import csv
import json
import time
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import random
import traceback
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException, NoSuchElementException,
)
from selenium.webdriver.common.action_chains import ActionChains
from config import create_browser, _scroll_into_view


# --------- main functions ---------
def extract_row_info(row, driver):
    """Scrape one result row and return it as a dict."""
    try:
        url_elem = row.find_element(By.CSS_SELECTOR, "a.docsum-title")
        title = url_elem.text.strip()
        originalLink = url_elem.get_attribute("href")
    except Exception as e:
        print("[ERROR] failed to get paper title or link:", e)
        return None  # url_elem is clicked below; bail out instead of crashing
    # Authors and citation info
    try:
        authors = row.find_element(By.XPATH,
                                   ".//span[contains(@class,'docsum-authors') and contains(@class,'full-authors')]").text
        citation = row.find_element(By.XPATH,
                                    ".//span[contains(@class,'docsum-journal-citation') and contains(@class,'full-journal-citation')]").text
    except Exception:
        citation = ""
        authors = ""
    print("Original link:", originalLink)
    print("Title:", title)
    print("Authors:", authors)
    print("Citation:", citation)

    try:
        downloadElem = row.find_element(By.XPATH, ".//div[contains(@class,'docsum-citation') and contains(@class,'full-citation')]")
        downloadText = downloadElem.text
    except Exception:
        downloadText = ""
    time.sleep(1)
    url_elem.click()
    time.sleep(3)  # wait for the page to load
    # Abstract
    try:
        abstract_elem = driver.find_element(By.CSS_SELECTOR, "#eng-abstract p")
        abstract_text = abstract_elem.text.strip()
    except NoSuchElementException:
        abstract_text = ""
    print("Abstract:", abstract_text)
    # Keywords (may be absent)
    try:
        keyword_elem = driver.find_element(By.CSS_SELECTOR, "#abstract > p")
        keyword_text = keyword_elem.text.replace("Keywords:", "").strip()
    except NoSuchElementException:
        keyword_text = ""
    print("Keywords:", keyword_text)
    pdf_url = ""
    if "Free PMC article" in downloadText:
        print("✅ Free article, PDF can be downloaded")
        original_handle = driver.current_window_handle
        original_handles = driver.window_handles.copy()

        # --- Click the download button ---
        print("[step] opening the download page ...")

        try:
            pdf_selector = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//a[contains(@class,'link-item') and contains(@class,'pmc')]"))
            )
        except Exception:
            print("❌ PMC PDF link not found, skipping download")
            pdf_selector = None
        if pdf_selector:

            try:
                pdf_selector.click()
            except Exception:
                driver.execute_script("arguments[0].click();", pdf_selector)
            print("[step] click done, waiting for a new window/page...")

            # --- Check whether a new window opened ---
            try:
                WebDriverWait(driver, 5).until(lambda d: len(d.window_handles) > len(original_handles))
                new_handle = [h for h in driver.window_handles if h not in original_handles][0]
                driver.switch_to.window(new_handle)
                print("[step] switched to new window:", new_handle)
            except TimeoutException:
                print("[step] no new window, continuing in the current one.")

            time.sleep(1)

            # --- Re-locate the PDF element after switching ---
            try:
                print("[step] waiting for the PDF button...")
                pdf_a = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located(
                        (By.XPATH, "//a[contains(@class,'usa-button') and contains(@href,'pdf/')]")
                    )
                )
                pdf_url = pdf_a.get_attribute("href")
                if pdf_url:
                    print("📄 PDF link:", pdf_url)

            except Exception as e:
                print("❌ failed to get PDF:", e)

            finally:
                # --- Close and switch back to the original window ---
                current = driver.current_window_handle
                if current != original_handle:
                    driver.close()
                    driver.switch_to.window(original_handle)
                time.sleep(1)
                print("[step] back on the original window.")

    else:
        print("❌ not a free article")

    # Navigate back to the result list
    driver.back()
    time.sleep(2)  # wait for the page to load

    return {
        "title": title,
        "author": authors,
        "site": "pubmed",
        "originalLink": originalLink,
        "citation": citation,
        "pdfUrl": pdf_url,
        "keywords": keyword_text,
        "summary": abstract_text
    }


def crawl_current_sort(driver, limit):
    """Scrape up to limit records under the current sort order."""
    fetched_count = 0
    results = []

    while fetched_count < limit:
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#search-form > div.inner-wrap > a.pubmed-logo > img'))
            )
        except TimeoutException:
            print("[WARN] result list did not appear on this page, continuing")
        time.sleep(2)

        rows = driver.find_elements(By.XPATH, '//*[@id="search-results"]/section/div[2]/div/article')

        for row in rows:
            if fetched_count >= limit:
                break
            try:
                info = extract_row_info(row, driver)
                if info:  # only append valid rows
                    results.append(info)
                    fetched_count += 1
                time.sleep(2)
            except Exception as e:
                print(f"[ERROR] failed to scrape row: {e}")
                traceback.print_exc()
                # Force a switch back to the main window to avoid deadlocks
                try:
                    if driver.window_handles:
                        driver.switch_to.window(driver.window_handles[0])
                except Exception:
                    pass
                continue  # keep going with the next row

        # Pagination
        try:
            next_btn = driver.find_element(By.CSS_SELECTOR, "#search-results > div.top-wrapper > div.top-pagination > button.button-wrapper.next-page-btn > img.chevron-icon.enabled-icon")
            if next_btn.is_enabled() and fetched_count < limit:
                _scroll_into_view(driver, next_btn)
                try:
                    next_btn.click()
                except Exception:
                    try:
                        ActionChains(driver).move_to_element(next_btn).pause(0.1).click(next_btn).perform()
                    except Exception:
                        driver.execute_script("arguments[0].click();", next_btn)
                time.sleep(random.uniform(1, 2))
            else:
                break
        except NoSuchElementException:
            print("[INFO] reached the last page or no pagination")
            break
        except Exception as e:
            print(f"[ERROR] pagination failed: {e}")
            break

    return results


def pubmed(keyword, limit, sort_options=None):
    """Main entry: scrape under the requested sort orders."""
    driver = create_browser()
    wait = WebDriverWait(driver, 15)
    all_results = {}
    if not sort_options:
        sort_options = ["relevance"]  # default: relevance

    try:
        driver.get("https://pubmed.ncbi.nlm.nih.gov/")
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#id_term"))).send_keys(keyword)
        time.sleep(1)
        driver.find_element(By.CSS_SELECTOR, "#search-form > div > div.search-input > div > button").click()
        time.sleep(4)
        for sort_name in sort_options:
            try:
                if sort_name == "publication_time":
                    print("[INFO] sorting by date")
                    # Wait for the sort dropdown
                    sort_elem = driver.find_element(By.ID, "id_sort")
                    sort_select = Select(sort_elem)
                    sort_select.select_by_value("pubdate")  # or select_by_visible_text("Publication date")
                    time.sleep(2)
                elif sort_name == "relevance":
                    print("[INFO] sorting by relevance (default)")

            except Exception as e:
                print(f"[WARN] failed to apply sort {sort_name}:", e)

            # Scrape results under the current sort order
            results = crawl_current_sort(driver, limit)
            all_results[sort_name] = results

    finally:
        try:
            driver.quit()
        except Exception:
            pass
        print("[DONE] pubmed crawl finished")
    print(json.dumps(all_results, ensure_ascii=False, indent=2))

    return all_results


if __name__ == '__main__':
    keyword = "bert"
    limit = 100
    # pubmed(keyword, limit, ["relevance"])
    # newest only
    # pubmed(keyword, limit, ["publication_time"])
    # relevance first, then newest
    pubmed(keyword, limit, ["relevance", "publication_time"])
@@ -0,0 +1,209 @@
# coding=utf-8
import json
import time
import random
import traceback
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException, NoSuchElementException,
)
from selenium.webdriver.common.action_chains import ActionChains
from config import create_browser, _scroll_into_view


def find_valid_detail_tab(driver, origin_handle, timeout=10):
    """Pick a valid new tab (one that is not the original window) from the existing handles."""
    end_time = time.time() + timeout
    while time.time() < end_time:
        for handle in driver.window_handles:
            if handle != origin_handle:
                try:
                    driver.switch_to.window(handle)
                    current_url = driver.current_url
                    if not current_url.startswith("chrome://") and current_url != "about:blank":
                        print(f"[switch window] success → {driver.title}")
                        return handle
                except Exception:
                    pass
        time.sleep(0.5)
    raise Exception("no valid detail-page window found within the time limit")


# --------- main functions ---------
def extract_row_info(row, driver):
    """Scrape one result row and return it as a dict."""

    try:
        type_text = row.find_element(By.XPATH, 'div/div/div[1]/span').text.strip()
    except Exception:
        type_text = ""
    # Skip anything that is not a research article
    if type_text != "Research article":
        return None
    title_element = row.find_element(By.XPATH, './/h2/a/span/span/span')
    title = title_element.text.strip()
    print("Title:", title)

    try:
        # Journal name
        journal_element = row.find_element(By.XPATH, './/div[@class="SubType hor text-xs u-clr-grey6"]//a')
        source = journal_element.text.strip()
        # Date (inside srctitle-date-fields, the span after the journal name)
        time_element = row.find_element(By.XPATH, './/div[@class="SubType hor text-xs u-clr-grey6"]//span[@class="srctitle-date-fields"]/span[last()]')
        date = time_element.text.strip()

    except Exception:
        source = ""
        date = ""
        print("journal or date not found")

    print(f"Journal: {source} | Date: {date}")
    print("Type:", type_text)

    time.sleep(1)
    origin = driver.current_window_handle
    existing_handles = driver.window_handles
    try:
        _scroll_into_view(driver, title_element)
        title_element.click()
    except Exception:
        try:
            ActionChains(driver).move_to_element(title_element).pause(1).click(title_element).perform()
        except Exception:
            driver.execute_script("arguments[0].click();", title_element)

    try:
        WebDriverWait(driver, 10).until(EC.new_window_is_opened(existing_handles))
    except TimeoutException:
        print("[WARN] no new window detected, skipping")
        return None

    try:
        detail_tab = find_valid_detail_tab(driver, origin)
        if detail_tab not in driver.window_handles:
            return None
        driver.switch_to.window(detail_tab)
        time.sleep(3)
        # Abstract (find_element, not find_elements: a list has no .text)
        abstract_elem = driver.find_element(By.CSS_SELECTOR, "#sp0010")
        summary_text = abstract_elem.text.strip()
        print("Abstract:", summary_text)
        authors = driver.find_elements(By.CSS_SELECTOR, "#author-group .react-xocs-alternative-link")

        author_names = [a.text.strip() for a in authors if a.text.strip()]
        print(author_names)

        time.sleep(1)
    finally:
        # Only close non-original windows
        if driver.current_window_handle != origin:
            driver.close()
        driver.switch_to.window(origin)
        time.sleep(random.uniform(0.5, 1.5))

    return {
        "title": title,
        "author": author_names,
        "source": source,
        "summary": summary_text
    }


def crawl_current_sort(driver, limit):
    """Scrape up to limit records under the current sort order."""
    fetched_count = 0
    results = []

    while fetched_count < limit:
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, '//*[@id="main_content"]/div[3]/div[1]/div[2]/div[2]'))
            )
        except TimeoutException:
            print("[WARN] result list did not appear on this page, continuing")
        time.sleep(2)

        rows = driver.find_elements(By.XPATH, '//*[@id="srp-results-list"]/ol/li')

        for row in rows:
            if fetched_count >= limit:
                break
            try:
                info = extract_row_info(row, driver)
                if info:
                    results.append(info)
                    time.sleep(2)
                    fetched_count += 1

            except Exception as e:
                print(f"[ERROR] {e}")
                traceback.print_exc()
                try:
                    if driver.window_handles:
                        driver.switch_to.window(driver.window_handles[0])
                except Exception:
                    pass

        # Pagination
        try:
            next_btn = driver.find_element(By.XPATH, "//*[@id='srp-pagination']/li[2]/a/span")

            if not next_btn.is_enabled() or fetched_count >= limit:
                break
            _scroll_into_view(driver, next_btn)
            try:
                next_btn.click()
            except Exception:
                try:
                    ActionChains(driver).move_to_element(next_btn).pause(1).click(next_btn).perform()
                except Exception:
                    driver.execute_script("arguments[0].click();", next_btn)
            time.sleep(5)
        except Exception:
            print("[INFO] reached the last page or pagination failed")
            break

    return results


def scienceDirect(keyword, limit):
    """Main entry: scrape under the configured sort orders."""
    driver = create_browser()
    wait = WebDriverWait(driver, 15)
    all_results = {}

    sortings = {
        "relevance": None,
        "publication_time": "#srp-sorting-options > div > a > span",
    }

    try:
        driver.get("https://www.sciencedirect.com/")

        search_input = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "qs"))
        )
        search_input.send_keys(keyword)
        time.sleep(2)
        search_button = driver.find_element(By.XPATH, '//*[@id="searchbar"]/div/div/form/div[2]/button')
        search_button.click()
        time.sleep(3)

        for sort_name, css_selector in sortings.items():
            if css_selector:
                try:
                    driver.find_element(By.CSS_SELECTOR, css_selector).click()
                    time.sleep(5)
                except Exception:
                    print(f"[WARN] failed to apply sort {sort_name}")

            results = crawl_current_sort(driver, limit)
            all_results[sort_name] = results
    finally:
        try:
            driver.quit()
        except Exception:
            pass
        print("[DONE] scienceDirect crawl finished")
    print(json.dumps(all_results, ensure_ascii=False, indent=2))

    return json.dumps(all_results, ensure_ascii=False, indent=2)


if __name__ == '__main__':
    keyword = "graphrag"
    limit = 10
    scienceDirect(keyword, limit)
@ -0,0 +1,275 @@
# coding=utf-8
import json
import time
import random
import traceback
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    StaleElementReferenceException,
    TimeoutException, NoSuchElementException,
)
from selenium.webdriver.common.action_chains import ActionChains
from config import create_browser, _scroll_into_view


def click_element_safe(driver, locator, retries=3, wait_time=10):
    """Click an element safely, guarding against StaleElementReference."""
    for attempt in range(retries):
        try:
            elem = WebDriverWait(driver, wait_time).until(
                EC.element_to_be_clickable(locator)
            )
            elem.click()
            return elem
        except StaleElementReferenceException:
            print(f"StaleElementReferenceException, retry {attempt+1}/{retries}")
            time.sleep(0.5)
    raise Exception("Click failed: element stayed unavailable")


# --------- main helpers ---------
def extract_row_info(row, driver):
    """Scrape one result card and return it as a dict."""
    _scroll_into_view(driver, row)

    # Entry type
    try:
        type_elem = row.find_element(
            By.CSS_SELECTOR,
            'div.app-card-open__main > div.app-entitlement > div > div > span'
        )
        type_text = type_elem.text.strip()
        print("Type:", type_text)
    except Exception:
        return None

    if type_text.lower() not in ["conference paper", "article"]:
        return None

    # Title
    try:
        title_element = row.find_element(By.CSS_SELECTOR, "div.app-card-open__main h3.app-card-open__heading a")
        title = title_element.text.strip()
    except Exception:
        return None

    # Authors
    try:
        authors_elem = row.find_element(By.CSS_SELECTOR, "div.app-card-open__authors span[data-test='authors']")
        authors = authors_elem.text.strip()
    except Exception:
        authors = None

    # Journal / book source
    try:
        source_elem = row.find_element(By.CSS_SELECTOR, "div.app-card-open__authors a[data-test='parent']")
        source = source_elem.text.strip()
    except Exception:
        source = None

    # Publication date
    try:
        date_elem = row.find_element(By.CSS_SELECTOR, "div.app-card-open__meta [data-test='published']")
        date = date_elem.text.strip()
    except Exception:
        date = None

    print("Title:", title)
    print("Authors:", authors)
    print("Source:", source)
    print("Published:", date)

    # Open the detail page via the title link and grab the abstract
    summary_text = ""
    try:
        title_locator = (By.CSS_SELECTOR, "div.app-card-open__main h3.app-card-open__heading a")
        # NOTE: this page-level locator resolves to the first card on the page;
        # clicking the row's own title_element would be more precise.
        click_element_safe(driver, title_locator)

        # Wait for the abstract on the detail page
        try:
            abstract_elem = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'section[data-title="Abstract"]'))
            )

            # Expand the full text if a "▽ More" link is present
            try:
                more_link = abstract_elem.find_element(By.XPATH, ".//a[contains(text(), '▽')]")
                driver.execute_script("arguments[0].click();", more_link)
                time.sleep(0.3)
            except NoSuchElementException:
                pass

            summary_text = abstract_elem.text.strip()
            if summary_text.startswith("Abstract"):
                summary_text = summary_text[len("Abstract"):].lstrip("\n").strip()

        except (TimeoutException, NoSuchElementException):
            summary_text = ""

        print("Abstract:", summary_text)

        time.sleep(1)

    finally:
        # Go back to the result list
        driver.back()
        time.sleep(random.uniform(1.5, 2.5))

    return {
        "title": title,
        "author": authors,
        "source": source,
        "date": date,
        "summary": summary_text
    }


def crawl_current_sort(driver, limit):
    """Scrape up to `limit` records under the current sort order."""
    fetched_count = 0
    results = []

    while fetched_count < limit:
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#search-submit'))
            )
        except TimeoutException:
            print("[WARN] Result list did not appear on this page, continuing anyway")
        time.sleep(2)

        rows = driver.find_elements(By.CSS_SELECTOR, '#main > div > div > div > div:nth-child(2) > div:nth-child(2) > ol > li')

        for i in range(len(rows)):
            # Re-locate the row each iteration: visiting the detail page and
            # going back makes previously found elements stale.
            row = driver.find_elements(By.CSS_SELECTOR, '#main > div > div > div > div:nth-child(2) > div:nth-child(2) > ol > li')[i]
            if fetched_count >= limit:
                break
            try:
                info = extract_row_info(row, driver)
                if info:
                    results.append(info)
                    time.sleep(2)
                    fetched_count += 1

            except Exception as e:
                print(f"[ERROR] {e}")
                traceback.print_exc()
                try:
                    if driver.window_handles:
                        driver.switch_to.window(driver.window_handles[0])
                except Exception:
                    pass

        # Pagination
        try:
            # Locate the single "next page" control (covers both pagination variants)
            next_btn = driver.find_element(
                By.CSS_SELECTOR,
                "ul.eds-c-pagination a[rel='next'], ul.eds-c-pagination a[data-test='next-page']"
            )

            # Stop when the button is disabled or the limit is reached
            if not next_btn.is_enabled() or fetched_count >= limit:
                break

            _scroll_into_view(driver, next_btn)
            try:
                next_btn.click()
            except Exception:
                try:
                    ActionChains(driver).move_to_element(next_btn).pause(0.1).click(next_btn).perform()
                except Exception:
                    driver.execute_script("arguments[0].click();", next_btn)

            time.sleep(1)

        except Exception:
            print("[INFO] Reached the last page or failed to paginate")
            break

    return results


def springerLink(keyword, limit, sort_options=None):
    """Entry point: scrape SpringerLink results under the selected sort orders."""
    driver = create_browser()

    wait = WebDriverWait(driver, 15)
    all_results = {}

    if not sort_options:
        sort_options = ["relevance"]  # relevance is the default

    try:
        driver.get("http://link.springer.com/")
        print("Page title:", driver.title)
        print("Current URL:", driver.current_url)
        try:
            accept_cookies_btn = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "button[data-cc-action='accept']")
                )
            )
            accept_cookies_btn.click()
            print("[INFO] Clicked 'Accept all cookies'")
        except Exception:
            print("[INFO] No cookie banner found")

        try:
            search_input = WebDriverWait(driver, 10).until(
                EC.visibility_of_element_located((By.CSS_SELECTOR, "#homepage-search"))
            )
        except TimeoutException:
            print("[ERROR] Search box failed to load")

        # Type the search keyword
        search_input = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "#homepage-search"))
        )
        driver.execute_script("arguments[0].scrollIntoView(true);", search_input)
        search_input.clear()
        search_input.send_keys(keyword)
        time.sleep(2)
        driver.find_element(By.CSS_SELECTOR, "#main > div.app-homepage-hero > div > search > form > div > button").click()
        time.sleep(2)

        # Iterate over the requested sort orders
        for sort_name in sort_options:
            if sort_name == "relevance":
                print("[INFO] Using relevance sort (default)")
                # SpringerLink sorts by relevance by default; no click needed
                pass

            elif sort_name == "publication_time":
                print("[INFO] Switching to newest-first sort")
                try:
                    # Open the sort dropdown
                    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#search-select"))).click()
                    time.sleep(1)
                    # Pick the "Newest First" option
                    driver.find_element(By.CSS_SELECTOR, "#search-select > option:nth-child(2)").click()
                    time.sleep(2)
                except Exception as e:
                    print(f"[WARN] Failed to switch to newest-first sort: {e}")

            # Scrape under the current sort order
            results = crawl_current_sort(driver, limit)
            all_results[sort_name] = results

    finally:
        try:
            driver.quit()
        except Exception:
            pass
        print("[DONE] SpringerLink crawl finished")

    print(json.dumps(all_results, ensure_ascii=False, indent=2))
    return all_results


if __name__ == '__main__':
    keyword = "graphrag"
    limit = 100
    # Relevance only (default):
    # springerLink(keyword, limit, ["relevance"])
    # Newest first:
    springerLink(keyword, limit, ["publication_time"])
    # Relevance first, then newest:
    springerLink(keyword, limit, ["relevance", "publication_time"])

@ -0,0 +1,243 @@
# coding=utf-8
import json
import time
import random
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import traceback
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from config import create_browser, _scroll_into_view


def find_valid_detail_tab(driver, origin_handle, timeout=10):
    """Pick a valid, newly opened tab that is not the original window."""
    end_time = time.time() + timeout
    while time.time() < end_time:
        for handle in driver.window_handles:
            if handle != origin_handle:
                try:
                    driver.switch_to.window(handle)
                    current_url = driver.current_url
                    if not current_url.startswith("chrome://") and current_url != "about:blank":
                        print(f"[Wanfang tab switch] OK → {driver.title}")
                        return handle
                except Exception:
                    pass
        time.sleep(0.5)
    raise Exception("No valid detail-page window found within the timeout")


# --------- main helpers ---------
def extract_row_info(row, driver):
    """Scrape one result row and return it as a dict."""
    try:
        type_text = row.find_element(By.XPATH, 'td[6]').text.strip()
    except Exception:
        type_text = ""
    # Skip anything that is not a journal article ("期刊论文")
    if type_text != "期刊论文":
        return None

    title_element = row.find_element(By.XPATH, 'td[2]/span[1]')
    title = title_element.text.strip()

    author_area = row.find_element(By.XPATH, 'td[3]')
    authors = author_area.find_elements(By.XPATH, ".//span[@class='authors'][not(contains(text(),'年'))]")
    author_names = [a.text for a in authors]
    # Journal source
    source = row.find_element(By.XPATH, "td[4]/span").text
    # Publication date
    date = row.find_element(By.XPATH, 'td[5]').text
    # Citation count
    quote = row.find_element(By.XPATH, 'td[7]').text
    # Download count
    download = row.find_element(By.XPATH, 'td[8]').text

    print("Type:", type_text)
    print("Title:", title)
    print("Authors:", author_names)
    print("Source:", source)
    print("Date:", date)
    print("Citations:", quote)
    print("Downloads:", download)
    time.sleep(1)
    origin = driver.current_window_handle
    existing_handles = driver.window_handles
    try:
        _scroll_into_view(driver, title_element)
        title_element.click()
    except Exception:
        try:
            ActionChains(driver).move_to_element(title_element).pause(0.1).click(title_element).perform()
        except Exception:
            driver.execute_script("arguments[0].click();", title_element)

    try:
        WebDriverWait(driver, 10).until(EC.new_window_is_opened(existing_handles))
    except TimeoutException:
        print("[WARN] No new window detected, skipping")
        return None

    summary_text = ""
    keywords = []
    try:
        detail_tab = find_valid_detail_tab(driver, origin)
        if detail_tab not in driver.window_handles:
            return None
        driver.switch_to.window(detail_tab)
        time.sleep(1)

        originalLink = driver.current_url
        print("Detail page URL:", originalLink)

        # Try to read the abstract
        try:
            summary_container = driver.find_element(By.CSS_SELECTOR, "#essential > div.detailList > div.summary.list")
            text_span = summary_container.find_element(By.CSS_SELECTOR, "span.text-overflow > span > span")
            summary_text = text_span.text

            # Expand the abstract if the "查看全部" (view all) toggle exists
            try:
                expand_btn = summary_container.find_element(By.CSS_SELECTOR,
                    "span.slot-box > span.abstractIcon.btn[title='查看全部']")
                driver.execute_script("arguments[0].click();", expand_btn)
                time.sleep(1)
                summary_text = text_span.text
            except Exception:
                pass
        except Exception:
            # Element missing: leave summary_text empty
            print("[WARN] Abstract not found")
        print("Abstract:", summary_text)

        # Keywords
        try:
            # Keyword container
            keyword_container = driver.find_element(By.CSS_SELECTOR, "#essential > div.detailList > div.keyword.list")
            # All keyword spans inside it
            keyword_elements = keyword_container.find_elements(By.CSS_SELECTOR, "div.itemKeyword a span")
            keywords = [el.text.strip() for el in keyword_elements]
        except Exception:
            print("[WARN] Keywords not found")
        print("Keywords:", keywords)
        time.sleep(1)
    finally:
        # Only close windows other than the original one
        if driver.current_window_handle != origin:
            driver.close()
        driver.switch_to.window(origin)
        time.sleep(random.uniform(0.5, 1.5))

    return {
        "title": title,
        "author": author_names,
        "source": source,
        "date": date,
        "site": "万方",
        "quote": quote,
        "originalLink": originalLink,
        "download": download,
        "keywords": keywords,
        "summary": summary_text
    }


def crawl_current_sort(driver, limit):
    """Scrape up to `limit` records under the current sort order."""
    fetched_count = 0
    results = []

    while fetched_count < limit:
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#anxs-logoName_sns'))
            )
        except TimeoutException:
            print("[WARN] Result table did not appear on this page, continuing anyway")
        time.sleep(2)

        rows = driver.find_elements(By.XPATH, '/html/body/div[5]/div/div[3]/div[2]/div/div[4]/div[2]/div[1]/table/tbody/tr')

        for row in rows:
            if fetched_count >= limit:
                break
            try:
                info = extract_row_info(row, driver)
                if info:
                    results.append(info)
                    time.sleep(2)
                    fetched_count += 1

            except Exception as e:
                print(f"[ERROR] {e}")
                traceback.print_exc()
                try:
                    if driver.window_handles:
                        driver.switch_to.window(driver.window_handles[0])
                except Exception:
                    pass

        # Pagination
        try:
            next_btn = driver.find_element(By.XPATH, "/html/body/div[5]/div/div[3]/div[2]/div/div[3]/div[2]/div[4]/span[3]")
            if not next_btn.is_enabled() or fetched_count >= limit:
                break
            _scroll_into_view(driver, next_btn)
            try:
                next_btn.click()
            except Exception:
                try:
                    ActionChains(driver).move_to_element(next_btn).pause(0.1).click(next_btn).perform()
                except Exception:
                    driver.execute_script("arguments[0].click();", next_btn)
            time.sleep(1)
        except Exception:
            print("[INFO] Reached the last page or failed to paginate")
            break

    return results


def wangfang(keyword, limit, sort_options=None):
    """Entry point: scrape Wanfang under the selected sort orders."""
    driver = create_browser()
    wait = WebDriverWait(driver, 15)
    all_results = {}
    if not sort_options:
        sort_options = ["relevance"]  # relevance is the default
    try:
        driver.get("https://www.wanfangdata.com.cn/index.html")
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#search-input"))).send_keys(keyword)
        driver.find_element(By.CLASS_NAME, "search-icon").click()
        time.sleep(1)
        # Switch to the table view
        element = driver.find_element(By.CLASS_NAME, "toggle-table-list")
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", element)
        time.sleep(2)
        element.click()
        for sort_name in sort_options:
            if sort_name == "relevance":
                print("[INFO] Using relevance sort (default)")
            elif sort_name == "download_count":
                print("[INFO] Using citation-count sort")
                try:
                    # NOTE: this clicks the "被引频次" (cited-by) column, not downloads
                    driver.find_element(By.XPATH, '//span[text()="被引频次"]').click()
                except Exception:
                    print(f"[WARN] Failed to click sort {sort_name}")
            elif sort_name == "publication_time":
                print("[INFO] Using publication-time sort")
                try:
                    driver.find_element(By.XPATH, '//span[text()="出版时间"]').click()
                except Exception:
                    print(f"[WARN] Failed to click sort {sort_name}")
            time.sleep(1)
            results = crawl_current_sort(driver, limit)
            all_results[sort_name] = results

    finally:
        try:
            driver.quit()
        except Exception:
            pass
        print("[DONE] Wanfang crawl finished")
    print(json.dumps(all_results, ensure_ascii=False, indent=2))
    return all_results


if __name__ == '__main__':
    keyword = "知识图谱"
    limit = 100
    wangfang(keyword, limit, ["relevance", "publication_time"])

@ -0,0 +1,280 @@
# coding=utf-8
import json
import time
import random
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import traceback
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchWindowException, WebDriverException
from selenium.webdriver.common.action_chains import ActionChains
from config import create_browser, _scroll_into_view


def find_valid_detail_tab(driver, origin_handle, timeout=10):
    """Pick a valid, newly opened tab that is not the original window."""
    end_time = time.time() + timeout
    while time.time() < end_time:
        for handle in driver.window_handles:
            if handle != origin_handle:
                try:
                    driver.switch_to.window(handle)
                    current_url = driver.current_url
                    if not current_url.startswith("chrome://") and current_url != "about:blank":
                        print(f"[Weipu tab switch] OK → {driver.title}")
                        return handle
                except Exception:
                    pass
        time.sleep(0.5)
    raise Exception("No valid detail-page window found within the timeout")


# --------- main helpers ---------
def extract_row_info(row, driver):
    """Scrape one result row and return it as a dict."""

    try:
        type_text = row.find_element(By.XPATH, 'td[5]/div/span').text.strip()
    except Exception:
        type_text = ""
    # Skip anything that is not a journal article ("期刊论文")
    if type_text != "期刊论文":
        return None

    try:
        title_element = row.find_element(By.XPATH, 'td[2]/div/div/a')
        title = title_element.text.strip()
        print("Title:", title)
    except Exception:
        print("[ERROR] Title element not found")
        return None

    try:
        author_elems = row.find_elements(
            By.XPATH,
            ".//div[contains(@class,'six-wrap')]//*[@data-warden-event-id='author-click']"
        )
        authors = [e.text.strip() for e in author_elems if e.text.strip()]
    except Exception:
        authors = []
    print("Authors:", authors)

    try:
        source = row.find_element(By.XPATH, "td[4]/div/a").text
    except Exception:
        source = ""
    print("Source:", source)
    print("Type:", type_text)

    time.sleep(1)
    try:
        origin = driver.current_window_handle
    except NoSuchWindowException:
        print("[ERROR] Current window no longer exists")
        return None

    existing_handles = driver.window_handles
    # Click the title to open the detail page in a new window
    clicked = False
    try:
        _scroll_into_view(driver, title_element)
        title_element.click()
        clicked = True
    except Exception:
        try:
            ActionChains(driver).move_to_element(title_element).pause(1).click(title_element).perform()
            clicked = True
        except Exception:
            try:
                driver.execute_script("arguments[0].click();", title_element)
                clicked = True
            except Exception:
                print("[ERROR] Failed to click the title")
                clicked = False

    if not clicked:
        return None

    try:
        WebDriverWait(driver, 10).until(EC.new_window_is_opened(existing_handles))
    except TimeoutException:
        print("[WARN] No new window detected, skipping")
        return None

    try:
        # Grab the new window handle
        new_handles = driver.window_handles
        detail_tab = next((h for h in new_handles if h != origin), None)
        if not detail_tab:
            print("[WARN] New window not found")
            return None

        driver.switch_to.window(detail_tab)
        time.sleep(1)

        try:
            originalLink = driver.current_url
            print("Detail page URL:", originalLink)
        except NoSuchWindowException:
            print("[ERROR] New window already closed")
            return None

        # Abstract
        summary_text = ""
        try:
            abstract_elems = driver.find_elements(By.CSS_SELECTOR, "span.ellipsis.content-text")
            if abstract_elems:
                summary_text = abstract_elems[0].text.strip()
            else:
                print("[WARN] Abstract not found")
        except Exception:
            summary_text = ""
        print("Abstract:", summary_text)

        # Keywords
        keywords = []
        try:
            keyword_container = driver.find_element(By.XPATH, "//div[contains(., '关键词')]")
            keyword_spans = keyword_container.find_elements(By.CSS_SELECTOR, "span.select_hover.pointer span")
            keywords = [k.text.strip() for k in keyword_spans if k.text.strip()]
        except Exception:
            keywords = []
        print("Keywords:", keywords)

        time.sleep(1)

    except (NoSuchWindowException, WebDriverException):
        print("[WARN] Window operation failed")
        return None

    finally:
        # Close the new window safely
        try:
            if driver.current_window_handle != origin:
                driver.close()
            driver.switch_to.window(origin)
            time.sleep(random.uniform(0.5, 1.0))
        except (NoSuchWindowException, WebDriverException):
            print("[WARN] Could not switch back to the original window")

    return {
        "title": title,
        "author": authors,
        "source": source,
        "site": "维普",
        "keywords": keywords,
        "originalLink": originalLink if 'originalLink' in locals() else "",
        "summary": summary_text
    }


def crawl_current_sort(driver, limit):
    """Scrape up to `limit` records under the current sort order."""
    fetched_count = 0
    results = []

    while fetched_count < limit:
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#search_container > div.s-list > div.yx-start.content.al-str'))
            )
        except TimeoutException:
            print("[WARN] Result table did not appear on this page, continuing anyway")
        time.sleep(2)

        rows = driver.find_elements(By.XPATH, '//*[@id="search_container"]/div[2]/div[2]/div/div/div[3]/table/tbody/tr')

        for row in rows:
            if fetched_count >= limit:
                break
            try:
                info = extract_row_info(row, driver)
                if info:
                    results.append(info)
                    time.sleep(2)
                    fetched_count += 1

            except Exception as e:
                print(f"[ERROR] {e}")
                traceback.print_exc()
                try:
                    if driver.window_handles:
                        driver.switch_to.window(driver.window_handles[0])
                except Exception:
                    pass

        # Pagination
        try:
            next_btn = driver.find_element(By.CSS_SELECTOR, "i.el-icon-arrow-right.pointer")

            if not next_btn.is_enabled() or fetched_count >= limit:
                break
            _scroll_into_view(driver, next_btn)
            try:
                next_btn.click()
            except Exception:
                try:
                    ActionChains(driver).move_to_element(next_btn).pause(0.1).click(next_btn).perform()
                except Exception:
                    driver.execute_script("arguments[0].click();", next_btn)
            time.sleep(5)
        except Exception:
            print("[INFO] Reached the last page or failed to paginate")
            break

    return results


def weipu(keyword, limit, sort_options=None):
    """Entry point: scrape Weipu (cqvip.com) under the selected sort orders."""
    driver = create_browser()
    wait = WebDriverWait(driver, 15)
    all_results = {}
    if not sort_options:
        sort_options = ["relevance"]  # relevance is the default
    try:
        driver.get("https://www.cqvip.com/")
        search_input = driver.find_element(By.XPATH, "//input[@placeholder='请输入检索词']")
        search_input.send_keys(keyword)
        time.sleep(2)
        search_button = driver.find_element(By.XPATH, "//button[.//span[contains(text(),'检索')]]")
        search_button.click()
        time.sleep(3)
        # Switch to the list view
        element = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, 'i[data-warden-event-id="list-arrange"]')
            )
        )
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", element)
        time.sleep(2)
        element.click()
        time.sleep(2)
        for sort_name in sort_options:
            if sort_name == "relevance":
                print("[INFO] Using relevance sort (default)")
            elif sort_name == "download_count":
                print("[INFO] Using citation-count sort")
                try:
                    driver.find_element(By.XPATH, '//span[contains(text(),"被引量")]').click()
                except Exception:
                    print(f"[WARN] Failed to click sort {sort_name}")
            elif sort_name == "publication_time":
                print("[INFO] Using recency sort")
                try:
                    driver.find_element(By.XPATH, '//span[contains(text(),"时效性")]').click()
                except Exception:
                    print(f"[WARN] Failed to click sort {sort_name}")
            time.sleep(1)
            results = crawl_current_sort(driver, limit)
            all_results[sort_name] = results
    finally:
        try:
            driver.quit()
        except Exception:
            pass
        print("[DONE] Weipu crawl finished")
    print(json.dumps(all_results, ensure_ascii=False, indent=2))

    return all_results


if __name__ == '__main__':
    keyword = "深度学习"
    limit = 10
    weipu(keyword, limit, ["relevance"])

@ -0,0 +1,271 @@
# coding=utf-8
import json
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import time
import random
import traceback
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from config import create_browser, _scroll_into_view


def find_valid_detail_tab(driver, origin_handle, timeout=10):
    """Pick a tab that is not chrome:// and whose title looks like a CNKI page."""
    end_time = time.time() + timeout
    while time.time() < end_time:
        for handle in driver.window_handles:
            if handle != origin_handle:
                try:
                    driver.switch_to.window(handle)
                    current_url = driver.current_url
                    current_title = driver.title
                    if not current_url.startswith("chrome://") and ("知网" in current_title or "CNKI" in current_title.upper()):
                        print(f"[CNKI tab switch] OK → {current_title}")
                        return handle
                except Exception:
                    pass
        time.sleep(0.5)
    raise Exception("No valid detail-page window found within the timeout")


# --------- main helpers ---------
def extract_row_info(row, driver):
    """Scrape one result row and return it as a dict."""
    td_name = None
    for _ in range(3):
        try:
            td_name = row.find_element(By.CSS_SELECTOR, 'td.name')
            break
        except Exception:
            time.sleep(0.3)
    if not td_name:
        return None

    a_tags = td_name.find_elements(By.TAG_NAME, 'a')
    if not a_tags:
        return None
    link_elem = a_tags[0]
    title = (link_elem.text or "").strip()
    if not title:
        return None

    try:
        author = row.find_element(By.CSS_SELECTOR, 'td.author').text
    except Exception:
        author = ""
    try:
        source = row.find_element(By.CSS_SELECTOR, 'td.source').text
    except Exception:
        source = ""
    try:
        date = row.find_element(By.CSS_SELECTOR, 'td.date').text
    except Exception:
        date = ""
    try:
        quote = row.find_element(By.CSS_SELECTOR, 'td.quote').text
    except Exception:
        quote = ""
    try:
        download = row.find_element(By.CSS_SELECTOR, 'td.download').text
    except Exception:
        download = ""

    print(f"Authors: {author}")
    print(f"Source: {source}")
    print(f"Published: {date}")
    print(f"Citations: {quote}")
    print(f"Downloads: {download}")
    print("-" * 50)

    try:
        origin = driver.current_window_handle
    except Exception:
        print("[WARN] Current window unavailable")
        return None

    existing_handles = driver.window_handles.copy()
    try:
        _scroll_into_view(driver, link_elem)
        link_elem.click()
    except Exception:
        try:
            ActionChains(driver).move_to_element(link_elem).pause(0.1).click(link_elem).perform()
        except Exception:
            try:
                driver.execute_script("arguments[0].click();", link_elem)
            except Exception:
                print("[WARN] Click failed")
                return None

    try:
        WebDriverWait(driver, 10).until(EC.new_window_is_opened(existing_handles))
    except TimeoutException:
        print("[WARN] No new window detected, skipping")
        return None

    originalLink = ""
    keywords = []
    summary = ""
    try:
        detail_tab = find_valid_detail_tab(driver, origin)
        if detail_tab not in driver.window_handles:
            print("[WARN] New window no longer exists")
            return None

        try:
            driver.switch_to.window(detail_tab)
            time.sleep(0.5)
            originalLink = driver.current_url
        except Exception:
            print("[WARN] Could not switch to the new window")
            return None

        try:
            keywords = [kw.text for kw in driver.find_elements(
                By.XPATH,
                "//span[@class='rowtit' and text()='关键词:']/following-sibling::p[@class='keywords']/a"
            )]
        except Exception:
            keywords = []

        try:
            summary = driver.find_element(By.XPATH, '//*[@id="ChDivSummary"]').text
        except Exception:
            summary = ""

        print(f"Keywords: {keywords}")
        print(f"Abstract: {summary}")

    finally:
        try:
            if driver.current_window_handle != origin:
                driver.close()
        except Exception:
            pass
        try:
            driver.switch_to.window(origin)
        except Exception:
            print("[WARN] Could not switch back to the original window")
        time.sleep(random.uniform(0.5, 1.5))

    return {
        "title": title,
        "author": author,
        "source": source,
        "date": date,
        "site": "知网",
        "originalLink": originalLink,
        "quote": quote,
        "download": download,
        "keywords": keywords,
        "summary": summary
    }


def crawl_current_sort(driver, limit):
    """Scrape up to `limit` records under the current sort order."""
    fetched_count = 0
    results = []

    while fetched_count < limit:
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#gridTable table tbody'))
            )
        except TimeoutException:
            print("[WARN] Result table did not appear on this page, continuing anyway")
        time.sleep(1)

        rows = driver.find_elements(By.CSS_SELECTOR, '#gridTable > div > div > div > table > tbody > tr')
        for row in rows:
            if fetched_count >= limit:
                break
            try:
                info = extract_row_info(row, driver)
                if info:
                    results.append(info)
                    fetched_count += 1
                    print(f"[{fetched_count}] {info['title']}")
            except Exception as e:
                print(f"[ERROR] {e}")
                traceback.print_exc()
                try:
                    if driver.window_handles:
                        driver.switch_to.window(driver.window_handles[0])
                except Exception:
                    pass

        # Pagination
        try:
            next_btn = driver.find_element(By.ID, "PageNext")
            if not next_btn.is_enabled() or fetched_count >= limit:
                break
            _scroll_into_view(driver, next_btn)
            try:
                next_btn.click()
            except Exception:
                try:
                    ActionChains(driver).move_to_element(next_btn).pause(0.1).click(next_btn).perform()
                except Exception:
                    driver.execute_script("arguments[0].click();", next_btn)
            time.sleep(1)
        except Exception:
            print("[INFO] Reached the last page or failed to paginate")
            break

    return results


def zhiwang(keyword, limit, sort_options=None):
    """Entry point: scrape CNKI under the selected sort orders."""
    print(f"[DEBUG][zhiwang] Received parameters: keyword='{keyword}', limit={limit}, sort_options={sort_options}")
    driver = create_browser()
    wait = WebDriverWait(driver, 15)
    all_results = {}
    if not sort_options:
        sort_options = ["publication_time"]  # default: publication time

    try:
        driver.get("https://www.cnki.net")
        wait.until(EC.presence_of_element_located((By.ID, "txt_SearchText"))).send_keys(keyword)
        driver.find_element(By.CLASS_NAME, "search-btn").click()
        time.sleep(2)

        for sort_name in sort_options:
            if sort_name == "publication_time":
                print("[INFO] Using publication-time sort (default)")
            elif sort_name == "download_count":
                print("[INFO] Using download-count sort")
                try:
                    download = driver.find_element(By.XPATH, '//ul[@id="orderList"]/li[text()="下载"]')
                    download.click()
                except Exception:
                    print(f"[WARN] Failed to click sort {sort_name}")
            elif sort_name == "cited_count":
                print("[INFO] Using citation-count sort")
                try:
                    cited = driver.find_element(By.XPATH, '//ul[@id="orderList"]/li[text()="被引"]')
                    cited.click()
                except Exception:
                    print(f"[WARN] Failed to click sort {sort_name}")
            elif sort_name == "relevance":
                print("[INFO] Using relevance sort")
                try:
                    relevance = driver.find_element(By.XPATH, '//ul[@id="orderList"]/li[text()="相关度"]')
                    relevance.click()
                except Exception:
                    print(f"[WARN] Failed to click sort {sort_name}")
            time.sleep(1)
            results = crawl_current_sort(driver, limit)
            all_results[sort_name] = results
    finally:
        try:
            driver.quit()
        except Exception:
            pass
        print("[DONE] CNKI crawl finished")
    print(json.dumps(all_results, ensure_ascii=False, indent=2))
    return all_results


if __name__ == '__main__':
    keyword = "graphrag"
    limit = 10
    zhiwang(keyword, limit, ["relevance", "publication_time"])

@ -1 +0,0 @@
Subproject commit 891bb94c9c0424de4aac9c4112a3c000bed7af87

@ -0,0 +1,10 @@
SECRET_KEY=your-secret-key
DEBUG=True
ALLOWED_HOSTS=127.0.0.1,localhost

DB_ENGINE=django.db.backends.sqlite3
DB_NAME=db.sqlite3
DB_USER=
DB_PASSWORD=
DB_HOST=
DB_PORT=
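
A minimal sketch of how these variables might be consumed in selenium_django/settings.py. This is hypothetical: the settings file itself is not part of this diff, and the python-dotenv dependency and load location are assumptions.

# settings.py fragment (hypothetical sketch; not shown in this commit)
import os
from pathlib import Path

from dotenv import load_dotenv  # assumed dependency: python-dotenv

BASE_DIR = Path(__file__).resolve().parent.parent
load_dotenv(BASE_DIR / ".env")  # load the key/value pairs above

SECRET_KEY = os.environ.get("SECRET_KEY", "your-secret-key")
DEBUG = os.environ.get("DEBUG", "False") == "True"
ALLOWED_HOSTS = os.environ.get("ALLOWED_HOSTS", "").split(",")

DATABASES = {
    "default": {
        "ENGINE": os.environ.get("DB_ENGINE", "django.db.backends.sqlite3"),
        "NAME": os.environ.get("DB_NAME", str(BASE_DIR / "db.sqlite3")),
        "USER": os.environ.get("DB_USER", ""),
        "PASSWORD": os.environ.get("DB_PASSWORD", ""),
        "HOST": os.environ.get("DB_HOST", ""),
        "PORT": os.environ.get("DB_PORT", ""),
    }
}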
@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

@ -0,0 +1,79 @@
<component name="InspectionProjectProfileManager">
  <profile version="1.0">
    <option name="myName" value="Project Default" />
    <inspection_tool class="Eslint" enabled="true" level="WARNING" enabled_by_default="true" />
    <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
      <option name="ignoredPackages">
        <value>
          <list size="57">
            <item index="0" class="java.lang.String" itemvalue="tqdm" />
            <item index="1" class="java.lang.String" itemvalue="scipy" />
            <item index="2" class="java.lang.String" itemvalue="h5py" />
            <item index="3" class="java.lang.String" itemvalue="matplotlib" />
            <item index="4" class="java.lang.String" itemvalue="torch" />
            <item index="5" class="java.lang.String" itemvalue="numpy" />
            <item index="6" class="java.lang.String" itemvalue="torchvision" />
            <item index="7" class="java.lang.String" itemvalue="opencv_python" />
            <item index="8" class="java.lang.String" itemvalue="Pillow" />
            <item index="9" class="java.lang.String" itemvalue="charset-normalizer" />
            <item index="10" class="java.lang.String" itemvalue="torchaudio" />
            <item index="11" class="java.lang.String" itemvalue="tokenizers" />
            <item index="12" class="java.lang.String" itemvalue="transformers" />
            <item index="13" class="java.lang.String" itemvalue="referencing" />
            <item index="14" class="java.lang.String" itemvalue="tzlocal" />
            <item index="15" class="java.lang.String" itemvalue="alibabacloud_openapi_util" />
            <item index="16" class="java.lang.String" itemvalue="python-dateutil" />
            <item index="17" class="java.lang.String" itemvalue="cffi" />
            <item index="18" class="java.lang.String" itemvalue="alibabacloud-dingtalk" />
            <item index="19" class="java.lang.String" itemvalue="MarkupSafe" />
            <item index="20" class="java.lang.String" itemvalue="Jinja2" />
            <item index="21" class="java.lang.String" itemvalue="frozenlist" />
            <item index="22" class="java.lang.String" itemvalue="jsonschema-specifications" />
            <item index="23" class="java.lang.String" itemvalue="exceptiongroup" />
            <item index="24" class="java.lang.String" itemvalue="alibabacloud-credentials" />
            <item index="25" class="java.lang.String" itemvalue="alibabacloud_gateway_dingtalk" />
            <item index="26" class="java.lang.String" itemvalue="certifi" />
            <item index="27" class="java.lang.String" itemvalue="anyio" />
            <item index="28" class="java.lang.String" itemvalue="alibabacloud-credentials-api" />
            <item index="29" class="java.lang.String" itemvalue="et_xmlfile" />
            <item index="30" class="java.lang.String" itemvalue="alibabacloud_tea_openapi" />
            <item index="31" class="java.lang.String" itemvalue="jsonschema" />
            <item index="32" class="java.lang.String" itemvalue="darabonba-core" />
            <item index="33" class="java.lang.String" itemvalue="flask-restx" />
            <item index="34" class="java.lang.String" itemvalue="importlib_resources" />
            <item index="35" class="java.lang.String" itemvalue="alibabacloud_tea_util" />
            <item index="36" class="java.lang.String" itemvalue="aiofiles" />
            <item index="37" class="java.lang.String" itemvalue="aiohappyeyeballs" />
            <item index="38" class="java.lang.String" itemvalue="cryptography" />
            <item index="39" class="java.lang.String" itemvalue="alibabacloud_gateway_spi" />
            <item index="40" class="java.lang.String" itemvalue="APScheduler" />
            <item index="41" class="java.lang.String" itemvalue="attrs" />
            <item index="42" class="java.lang.String" itemvalue="chardet" />
            <item index="43" class="java.lang.String" itemvalue="pandas" />
            <item index="44" class="java.lang.String" itemvalue="alibabacloud-tea" />
            <item index="45" class="java.lang.String" itemvalue="lark-oapi" />
            <item index="46" class="java.lang.String" itemvalue="colorama" />
            <item index="47" class="java.lang.String" itemvalue="aiohttp" />
            <item index="48" class="java.lang.String" itemvalue="multidict" />
            <item index="49" class="java.lang.String" itemvalue="yarl" />
            <item index="50" class="java.lang.String" itemvalue="aiosignal" />
            <item index="51" class="java.lang.String" itemvalue="idna" />
            <item index="52" class="java.lang.String" itemvalue="openpyxl" />
            <item index="53" class="java.lang.String" itemvalue="requests-toolbelt" />
            <item index="54" class="java.lang.String" itemvalue="pymysql" />
            <item index="55" class="java.lang.String" itemvalue="poplib" />
            <item index="56" class="java.lang.String" itemvalue="sqlalchemy" />
          </list>
        </value>
      </option>
    </inspection_tool>
    <inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
      <option name="ignoredErrors">
        <list>
          <option value="N806" />
        </list>
      </option>
    </inspection_tool>
    <inspection_tool class="TsLint" enabled="true" level="WARNING" enabled_by_default="true" />
  </profile>
</component>

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
</component>

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (selenium_django)" project-jdk-type="Python SDK" />
</project>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/selenium_django.iml" filepath="$PROJECT_DIR$/.idea/selenium_django.iml" />
    </modules>
  </component>
</project>

@ -0,0 +1,27 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="FacetManager">
    <facet type="django" name="Django">
      <configuration>
        <option name="rootFolder" value="$MODULE_DIR$" />
        <option name="settingsModule" value="selenium_django/settings.py" />
        <option name="manageScript" value="$MODULE_DIR$/manage.py" />
        <option name="environment" value="&lt;map/&gt;" />
        <option name="doNotUseTestRunner" value="false" />
        <option name="trackFilePattern" value="migrations" />
      </configuration>
    </facet>
  </component>
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="jdk" jdkName="Python 3.10 (selenium_django)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="PyDocumentationSettings">
    <option name="format" value="PLAIN" />
    <option name="myDocStringFormat" value="Plain" />
  </component>
  <component name="TemplatesService">
    <option name="TEMPLATE_CONFIGURATION" value="Django" />
  </component>
</module>

@ -0,0 +1,32 @@
FROM python:3.11-slim

WORKDIR /app

# Use a China-local pip mirror
RUN pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple

# Copy and install dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the project
COPY . .

# Expose the port
EXPOSE 8000

# Default environment variables
ENV CELERY_BROKER_URL=${CELERY_BROKER_URL:-redis://redis:6379/0}
ENV CELERY_RESULT_BACKEND=${CELERY_RESULT_BACKEND:-redis://redis:6379/0}
ENV CRAWL_API_URL=${CRAWL_API_URL:-http://47.83.141.164:5001/crawl}

# Bake the configuration into settings.py at build time
RUN sed -i "s#CELERY_BROKER_URL = .*#CELERY_BROKER_URL = '${CELERY_BROKER_URL}'#" selenium_django/settings.py && \
    sed -i "s#CELERY_RESULT_BACKEND = .*#CELERY_RESULT_BACKEND = '${CELERY_RESULT_BACKEND}'#" selenium_django/settings.py && \
    sed -i "s#CRAWL_API_URL = .*#CRAWL_API_URL = '${CRAWL_API_URL}'#" selenium_django/settings.py

# Entrypoint script
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh

CMD ["/entrypoint.sh"]

@ -0,0 +1,22 @@
Step 1: start the Redis server (the Celery broker).
Celery uses Redis as its message queue by default.
Adjust the configuration in selenium_django/settings.py (all settings live there).

Step 2: start a Celery worker.
The worker executes the asynchronous task trigger_task_execution.

Run: celery -A selenium_django worker --loglevel=info

Alternatively, with the single-process solo pool: celery -A selenium_django worker -l info --pool=solo
Step 3: start the Django development server:
python manage.py runserver



docker build -t selenium-django .

docker run -d \
  --name selenium-django-container \
  -p 8001:8000 \
  selenium-django
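
Once Redis, the Celery worker, and the Django server from the steps above are running, a quick smoke test can exercise the whole pipeline. A hedged sketch follows; the /api/tasks/ route prefix and the trigger/ sub-route are assumptions inferred from TaskViewSet's @action, so check selenium_django/urls.py for the actual paths:

# smoke_test.py (hypothetical sketch; the route prefix is an assumption)
import requests

BASE = "http://127.0.0.1:8000/api/tasks/"

# Create a task; "description" is forwarded to the crawler service as "texts".
task = requests.post(BASE, json={
    "task_id": "demo-001",
    "name": "graphrag survey",
    "description": "graphrag",
    "execution_type": "predefined",
    "parse_flag": False,
    "limit": 10,
}).json()

# POST .../<pk>/trigger/ enqueues trigger_task_execution via Celery.
resp = requests.post(f"{BASE}{task['id']}/trigger/")
print(resp.json())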
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,3 @@
from django.contrib import admin

# Register your models here.

@ -0,0 +1,14 @@
# api/apps.py
from django.apps import AppConfig


class ApiConfig(AppConfig):
    default_auto_field = 'django.db.models.BigAutoField'
    name = 'api'

    def ready(self):
        # import os
        # if os.environ.get('RUN_MAIN') == 'true':  # only start in the main runserver process
        from .scheduler import start_scheduler
        print("Starting scheduler...")
        start_scheduler()
        print("Scheduler started")

@ -0,0 +1,95 @@
# Generated by Django 5.2.6 on 2025-09-11 02:54

import django.db.models.deletion
from django.db import migrations, models


class Migration(migrations.Migration):

    initial = True

    dependencies = []

    operations = [
        migrations.CreateModel(
            name="Task",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("task_id", models.CharField(max_length=64, unique=True)),
                ("name", models.CharField(max_length=200)),
                ("description", models.TextField(blank=True, null=True)),
                (
                    "execution_type",
                    models.CharField(
                        choices=[("scheduled", "定期执行"), ("predefined", "预定时间执行")],
                        max_length=20,
                    ),
                ),
                ("execution_time", models.DateTimeField(blank=True, null=True)),
                (
                    "scheduled_time",
                    models.CharField(blank=True, max_length=10, null=True),
                ),
                ("parse_flag", models.BooleanField(default=False)),
                ("limit", models.IntegerField(default=60)),
                (
                    "status",
                    models.CharField(
                        choices=[
                            ("running", "进行中"),
                            ("idle", "空闲中"),
                            ("done", "完成"),
                            ("failed", "失败"),
                        ],
                        default="idle",
                        max_length=20,
                    ),
                ),
                ("created_at", models.DateTimeField(auto_now_add=True)),
                ("updated_at", models.DateTimeField(auto_now=True)),
            ],
        ),
        migrations.CreateModel(
            name="TaskDetail",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("author", models.CharField(blank=True, max_length=500)),
                ("date", models.CharField(blank=True, max_length=100, null=True)),
                ("download", models.IntegerField(blank=True, null=True)),
                ("keywords", models.TextField(blank=True)),
                ("original_link", models.URLField(blank=True)),
                ("pdf_url", models.URLField(blank=True)),
                ("quote", models.TextField(blank=True)),
                ("source", models.CharField(blank=True, max_length=200)),
                ("site", models.CharField(blank=True, max_length=200)),
                ("summary", models.TextField(blank=True)),
                ("parsed_summary", models.JSONField(blank=True, null=True)),
                ("title", models.CharField(blank=True, max_length=300)),
                ("created_at", models.DateTimeField(auto_now_add=True)),
                (
                    "task",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="details",
                        to="api.task",
                    ),
                ),
            ],
        ),
    ]

@ -0,0 +1,23 @@
# Generated by Django 5.2.6 on 2025-09-11 02:57

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ("api", "0001_initial"),
    ]

    operations = [
        migrations.AlterField(
            model_name="task",
            name="execution_type",
            field=models.CharField(
                blank=True,
                choices=[("scheduled", "定期执行"), ("predefined", "预定时间执行")],
                max_length=20,
                null=True,
            ),
        ),
    ]

@ -0,0 +1,18 @@
# Generated by Django 5.2.6 on 2025-09-11 03:33

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ("api", "0002_alter_task_execution_type"),
    ]

    operations = [
        migrations.AddField(
            model_name="task",
            name="last_run_date",
            field=models.DateField(blank=True, null=True),
        ),
    ]

Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

@ -0,0 +1,55 @@
from django.db import models


class Task(models.Model):
    TASK_STATUS_CHOICES = [
        ('running', '进行中'),
        ('idle', '空闲中'),
        ('done', '完成'),
        ('failed', '失败'),
    ]

    EXECUTION_TYPE_CHOICES = [
        ('scheduled', '定期执行'),
        ('predefined', '预定时间执行'),
    ]

    task_id = models.CharField(max_length=64, unique=True)
    name = models.CharField(max_length=200)
    description = models.TextField(blank=True, null=True)
    last_run_date = models.DateField(null=True, blank=True)
    execution_type = models.CharField(
        max_length=20,
        choices=EXECUTION_TYPE_CHOICES,
        blank=True,
        null=True
    )
    # One-off tasks use a DateTimeField
    execution_time = models.DateTimeField(blank=True, null=True)
    # Daily tasks store the time as a string in HH:MM format
    scheduled_time = models.CharField(max_length=10, blank=True, null=True)
    parse_flag = models.BooleanField(default=False)
    limit = models.IntegerField(default=60)  # new field, defaults to 60
    status = models.CharField(max_length=20, choices=TASK_STATUS_CHOICES, default='idle')
    created_at = models.DateTimeField(auto_now_add=True)
    updated_at = models.DateTimeField(auto_now=True)

    def __str__(self):
        return self.name


class TaskDetail(models.Model):
    task = models.ForeignKey(Task, related_name="details", on_delete=models.CASCADE)
    author = models.CharField(max_length=500, blank=True)
    date = models.CharField(max_length=100, blank=True, null=True)  # stored as a string
    download = models.IntegerField(blank=True, null=True)
    keywords = models.TextField(blank=True)  # semicolon-separated keywords
    original_link = models.URLField(blank=True)
    pdf_url = models.URLField(blank=True)
    quote = models.TextField(blank=True)
    source = models.CharField(max_length=200, blank=True)
    site = models.CharField(max_length=200, blank=True)
    summary = models.TextField(blank=True)
    parsed_summary = models.JSONField(blank=True, null=True)  # stored as JSON
    title = models.CharField(max_length=300, blank=True)
    created_at = models.DateTimeField(auto_now_add=True)

@ -0,0 +1,104 @@
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.date import DateTrigger
from datetime import datetime, date
from .models import Task
from .tasks import trigger_task_execution
import logging

logger = logging.getLogger(__name__)
scheduler = BackgroundScheduler(timezone=None)  # use local time
scheduler_started = False


def start_scheduler():
    global scheduler_started
    if scheduler_started:
        return
    scheduler_started = True

    scheduler.start()
    logger.info("APScheduler started")
    # Poll one-off tasks every 30 seconds
    scheduler.add_job(check_predefined_tasks, 'interval', seconds=30)

    # Poll for newly created daily tasks every 30 seconds
    scheduler.add_job(sync_scheduled_tasks, 'interval', seconds=30)


def check_predefined_tasks():
    """Check one-off tasks and hand due ones to Celery."""
    logger.info("Checking one-off tasks")
    now = datetime.now()  # local time
    tasks = Task.objects.filter(status='idle', execution_type='predefined')
    logger.debug(f"[Predefined] checking {len(tasks)} one-off tasks, now {now}")

    for task in tasks:
        exec_time = task.execution_time
        if not exec_time:
            logger.warning(f"Task {task.id} has no execution_time set, skipping")
            continue

        # The database already stores local time; no timezone-aware conversion needed
        if exec_time <= now:
            try:
                # Trigger the Celery task asynchronously, passing only task.id
                trigger_task_execution.delay(task.id)
                logger.info(f"Task {task.id} handed to Celery")

                # Mark the task done so it is not triggered again
                task.status = 'done'
                task.save(update_fields=['status'])
            except Exception as e:
                logger.error(f"Error triggering Task {task.id}: {e}")


def sync_scheduled_tasks():
    """Sync daily scheduled tasks into APScheduler."""
    today = date.today()
    now = datetime.now()  # local time
    tasks = Task.objects.filter(status='idle', execution_type='scheduled')
    logger.debug(f"[Scheduled] checking {len(tasks)} daily tasks, now {now}")

    for task in tasks:
        st = task.scheduled_time
        if not st:
            continue

        # Parse the time string
        try:
            scheduled_time_obj = datetime.strptime(st, "%H:%M:%S").time()
        except ValueError:
            scheduled_time_obj = datetime.strptime(st, "%H:%M").time()

        last_run = task.last_run_date
        if last_run != today:
            # Plain local time; no make_aware needed
            exec_datetime = datetime.combine(today, scheduled_time_obj)

            job_id = f"scheduled_task_{task.id}"
            if not scheduler.get_job(job_id):
                scheduler.add_job(
                    run_scheduled_task,
                    trigger=DateTrigger(run_date=exec_datetime),
                    id=job_id,
                    args=[task.id],
                    replace_existing=True,
                    misfire_grace_time=1  # allow at most 1 s of delay, otherwise skip
                )


def run_scheduled_task(task_id):
    """Execute one daily scheduled task."""
    try:
        task = Task.objects.get(id=task_id)
    except Task.DoesNotExist:
        logger.warning(f"[Scheduled] Task {task_id} does not exist")
        return

    try:
        trigger_task_execution.delay(task.id)
        logger.info(f"[Scheduled] Task {task.id} handed to Celery")
        task.last_run_date = date.today()
        task.save(update_fields=['last_run_date'])
    except Exception as e:
        logger.error(f"[Scheduled] Error running Task {task.id}: {e}")

@ -0,0 +1,33 @@
from rest_framework import serializers
from .models import Task, TaskDetail


class TaskDetailSerializer(serializers.ModelSerializer):
    class Meta:
        model = TaskDetail
        fields = "__all__"


# List endpoint: trimmed-down serializer without details
class TaskListSerializer(serializers.ModelSerializer):
    class Meta:
        model = Task
        fields = [
            'id',  # include the primary key
            'task_id', 'name', 'description', 'last_run_date', 'execution_type',
            'execution_time', 'scheduled_time', 'parse_flag', 'limit',
            'status', 'created_at', 'updated_at'
        ]


# Detail endpoint: full serializer (the details relation is currently disabled)
class TaskSerializer(serializers.ModelSerializer):
    # details = TaskDetailSerializer(many=True, read_only=True)

    class Meta:
        model = Task
        fields = [
            'id',  # include the primary key
            'task_id', 'name', 'description', 'last_run_date', 'execution_type',
            'execution_time', 'scheduled_time', 'parse_flag', 'limit',
            'status', 'created_at', 'updated_at'
        ]

@ -0,0 +1,140 @@
# tasks.py
import requests

from django.db import transaction

from .models import Task, TaskDetail
from celery import shared_task
from selenium_django.settings import CRAWL_API_URL


def safe_dict_get(d, key, default=None):
    """Get a dict key safely (returns default for non-dict inputs)."""
    if isinstance(d, dict):
        return d.get(key, default)
    return default


@shared_task(bind=True, max_retries=3, default_retry_delay=60)
def trigger_task_execution(self, task_id):
    """Execute a single task asynchronously."""
    task = None
    try:
        # Load the task
        task = Task.objects.get(id=task_id)
        task.status = 'running'
        task.save(update_fields=['status'])
        print(f"Task {task_id} status set to running")

        # Call the crawler service
        payload = {
            "texts": task.description,
            "parse": task.parse_flag,
            "limit": task.limit
        }

        try:
            resp = requests.post(CRAWL_API_URL, json=payload, timeout=30000)  # timeout is in seconds
            resp.raise_for_status()
        except requests.RequestException as e:
            print(f"Task {task_id} crawler request failed: {e}")
            raise self.retry(exc=e)

        # Parse the JSON response defensively
        try:
            data = resp.json()
            if not isinstance(data, dict):
                print(f"Task {task_id} response is not a dict, using an empty dict")
                data = {}
        except ValueError:
            print(f"Task {task_id} returned non-JSON data: {resp.text[:200]}")
            data = {}

        # code == 20000 means keyword extraction failed
        if safe_dict_get(data, "code") == 20000:
            print(f"Task {task_id} crawler returned code=20000, message={data.get('message')}")
            return {"success": False, "message": data.get("message", "no keywords could be extracted")}

        # Persist the task details
        results = safe_dict_get(data, "results", [])
        if not isinstance(results, list):
            results = []

        with transaction.atomic():
            for idx, item in enumerate(results, start=1):
                if not isinstance(item, dict):
                    print(f"Task {task_id} results item {idx} is not a dict, skipping")
                    continue

                download_val = item.get("download") or 0
                try:
                    download_val = int(download_val)
                except (ValueError, TypeError):
                    download_val = 0

                date_val = str(item.get("date")) if item.get("date") else None

                author_val = item.get("author")
                if isinstance(author_val, list):
                    author_val = ';'.join(author_val)
                elif author_val is None:
                    author_val = ''

                keywords_val = item.get("keywords")
                if isinstance(keywords_val, list):
                    keywords_val = ';'.join(keywords_val)
                else:
                    keywords_val = ''

                pdf_url = item.get("pdfUrl") or ''
                parsed_summary = item.get("parsed_summary") or {}
                quote_val = item.get("quote") or ''
                site_val = item.get("site") or ''
                source_val = item.get("source") or ''
                summary_val = item.get("summary") or ''
                title_val = item.get("title") or ''
                original_link = item.get("originalLink") or ''

                # Save the TaskDetail; one bad row must not break the rest
                try:
                    TaskDetail.objects.get_or_create(
                        task=task,
                        original_link=original_link,
                        defaults={
                            'author': author_val,
                            'date': date_val,
                            'download': download_val,
                            'keywords': keywords_val,
                            'pdf_url': pdf_url,
                            'parsed_summary': parsed_summary,
                            'quote': quote_val,
                            'site': site_val,
                            'source': source_val,
                            'summary': summary_val,
                            'title': title_val
                        }
                    )
                    print(f"Task {task_id} saved result {idx}")
                except Exception as e:
                    print(f"Task {task_id} failed to save result {idx}: {e}")
                    continue

        # Mark the task done
        task.status = 'done'
        task.save(update_fields=['status'])
        print(f"Task {task_id} finished")

    except Task.DoesNotExist:
        print(f"Task {task_id} does not exist")
    except Exception as e:
        print(f"Task {task_id} failed: {e}")
        try:
            if task:
                task.status = 'failed'
                task.save(update_fields=['status'])
        except Exception as e2:
            print(f"Failed to record the failed status: {e2}")
        raise self.retry(exc=e)

@ -0,0 +1,3 @@
from django.test import TestCase

# Create your tests here.

@ -0,0 +1,208 @@
import asyncio
import json

import aiohttp
from django.http import StreamingHttpResponse
from django_filters.rest_framework import DjangoFilterBackend
from rest_framework import filters, status, viewsets
from rest_framework.decorators import action
from rest_framework.pagination import PageNumberPagination
from rest_framework.response import Response

from selenium_django.settings import api_info

from .models import Task, TaskDetail
from .serializers import TaskSerializer, TaskDetailSerializer, TaskListSerializer
from .tasks import trigger_task_execution

# Create your views here.

# Pagination settings
class StandardResultsSetPagination(PageNumberPagination):
    page_size = 10
    page_size_query_param = 'page_size'
    max_page_size = 100


def sync_stream(generator):
    """Wrap an async iterator as a synchronous iterator."""
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    async_gen = generator
    try:
        while True:
            try:
                # Fetch the next chunk from the async generator
                chunk = loop.run_until_complete(async_gen.__anext__())
                if chunk and chunk.strip():
                    yield chunk
            except StopAsyncIteration:
                break
    finally:
        loop.close()
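
# Note: sync_stream drives the async generator on a private event loop, one
# chunk per run_until_complete() call, so the async model stream can feed
# Django's synchronous StreamingHttpResponse under WSGI. Usage sketch:
#
#     for chunk in sync_stream(call_model_stream(messages)):
#         print(chunk, end="")
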
async def call_model_stream(messages):
    url = f"{api_info['base_url']}/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_info['api_key']}"
    }
    payload = {
        "model": api_info["model"],
        "messages": messages,
        "max_output_tokens": 1024,
        "stream": True
    }

    async with aiohttp.ClientSession() as session:
        async with session.post(url, headers=headers, json=payload) as resp:
            async for line in resp.content:
                if line:
                    line_str = line.decode().strip()
                    if line_str.startswith("data: "):
                        data_str = line_str[len("data: "):]
                        if data_str == "[DONE]":
                            break
                        data_json = json.loads(data_str)
                        delta = data_json.get("choices", [{}])[0].get("delta", {}).get("content", "")
                        if delta and delta.strip():  # only yield non-empty chunks
                            yield delta
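
# The parser above assumes OpenAI-style SSE framing from the upstream API,
# i.e. lines of the form (illustrative):
#
#     data: {"choices": [{"delta": {"content": "Hel"}}]}
#     data: [DONE]
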
class TaskViewSet(viewsets.ModelViewSet):
    queryset = Task.objects.all().order_by('-created_at')
    pagination_class = StandardResultsSetPagination
    filter_backends = [DjangoFilterBackend, filters.SearchFilter, filters.OrderingFilter]
    filterset_fields = ['task_id', 'status']
    search_fields = ['name', 'site']
    ordering_fields = ['created_at', 'updated_at']

    def get_serializer_class(self):
        if self.action == 'list':
            return TaskListSerializer  # list returns the slimmed-down fields
        return TaskSerializer  # retrieve returns the full fields, including details

    @action(detail=True, methods=["post"])
    def trigger(self, request, pk=None):
        task = self.get_object()

        try:
            # Trigger the Celery task asynchronously
            async_result = trigger_task_execution.delay(task.id)

            # Report that the task was triggered without touching async_result's content
            return Response({
                "success": True,
                "task_id": async_result.id,
                "message": f"Task {task.id} has been triggered"
            }, status=status.HTTP_200_OK)

        except Exception as e:
            return Response({
                "success": False,
                "message": str(e)
            }, status=status.HTTP_500_INTERNAL_SERVER_ERROR)

    @action(detail=True, methods=['post'])
    def chat(self, request, pk=None):
        task = self.get_object()
        user_question = request.data.get("question", "")
        if not user_question:
            return Response({"success": False, "message": "the question parameter must not be empty"}, status=400)

        # Build the structured documents
        all_docs = TaskDetail.objects.filter(task=task)
        all_docs_list = []
        for doc in all_docs:
            all_docs_list.append({
                "title": doc.title or "",
                "summary": doc.summary or "",
                "parsed_summary": doc.parsed_summary or "",
                "author": doc.author or "",
                "original_link": doc.original_link or "",
                "pdf_url": doc.pdf_url or "",
                "source": doc.source or "",
                "keywords": doc.keywords or ""
            })
        all_docs_json = json.dumps(all_docs_list, ensure_ascii=False)

        SYSTEM_PROMPT = """
You are a professional literature Q&A assistant. Answer the user's question strictly from the task documents provided.
The task documents are supplied as a structured JSON list; each document contains the fields:
"title", "summary", "parsed_summary", "author", "original_link", "pdf_url", "source", "keywords"

Requirements:
1. Answer only from the document content; do not add outside knowledge.
2. Address the user's question directly; do not output the whole JSON.
3. If the documents lack the relevant information, say it was not provided.
4. Keep the output readable, with no extra content or additional JSON structure.
"""

        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": f"Task documents:\n{all_docs_json}\nUser question: {user_question}"}
        ]

        # Stream the answer back via Django's StreamingHttpResponse
        response = StreamingHttpResponse(sync_stream(call_model_stream(messages)), content_type="text/event-stream")
        return response


class TaskDetailViewSet(viewsets.ModelViewSet):
    queryset = TaskDetail.objects.all().order_by('-created_at')
    serializer_class = TaskDetailSerializer
    pagination_class = StandardResultsSetPagination
    filter_backends = [filters.SearchFilter, filters.OrderingFilter]
    search_fields = ['title', 'author', 'site']

    def get_queryset(self):
        queryset = super().get_queryset()
        task_id = self.request.query_params.get('task')
        if task_id and task_id.isdigit():
            queryset = queryset.filter(task_id=int(task_id))
            # Deduplicate per task in Python
            seen_titles = set()
            unique_queryset = []
            for obj in queryset:
                if obj.title not in seen_titles:
                    unique_queryset.append(obj)
                    seen_titles.add(obj.title)
            return unique_queryset
        return queryset

    def create(self, request, *args, **kwargs):
        """
        Incremental, per-task insertion on the stock create endpoint
        """
        task_id = request.data.get('task_id')
        if not task_id:
            return Response({"detail": "task_id is missing"}, status=status.HTTP_400_BAD_REQUEST)

        data_list = request.data.get('data', [])
        if not data_list:
            return Response({"detail": "data is missing"}, status=status.HTTP_400_BAD_REQUEST)

        added_count = 0
        skipped_titles = []

        for data in data_list:
            title = data.get('title')
            if not title:
                continue

            # Check whether this title already exists for the same task
            if TaskDetail.objects.filter(task_id=task_id, title=title).exists():
                skipped_titles.append(title)
                continue

            # Create the detail if it does not exist yet
            serializer = self.get_serializer(data={**data, "task_id": task_id})
            serializer.is_valid(raise_exception=True)
            serializer.save()
            added_count += 1

        return Response({
            "added_count": added_count,
            "skipped_titles": skipped_titles
        }, status=status.HTTP_201_CREATED)
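
For orientation, a hedged sketch of how a client might exercise these endpoints once the router (registered in urls.py below) is mounted under /api/; the host and IDs are illustrative:

import requests

BASE = "http://localhost:8000/api"  # illustrative host

# Kick off the crawl for task 1 via the custom trigger action
print(requests.post(f"{BASE}/tasks/1/trigger/").json())

# Stream a chat answer grounded in the task's saved documents
with requests.post(f"{BASE}/tasks/1/chat/", json={"question": "..."}, stream=True) as r:
    for chunk in r.iter_content(chunk_size=None, decode_unicode=True):
        print(chunk, end="")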
File diff suppressed because it is too large
Binary file not shown.
@ -0,0 +1,10 @@
#!/bin/bash
# entrypoint.sh

# Start the Celery worker
echo "Starting Celery..."
celery -A selenium_django worker -l info --pool=solo &

# Start Django
echo "Starting Django..."
exec gunicorn selenium_django.wsgi:application --bind 0.0.0.0:8000
@ -0,0 +1,22 @@
#!/usr/bin/env python
"""Django's command-line utility for administrative tasks."""
import os
import sys


def main():
    """Run administrative tasks."""
    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "selenium_django.settings")
    try:
        from django.core.management import execute_from_command_line
    except ImportError as exc:
        raise ImportError(
            "Couldn't import Django. Are you sure it's installed and "
            "available on your PYTHONPATH environment variable? Did you "
            "forget to activate a virtual environment?"
        ) from exc
    execute_from_command_line(sys.argv)


if __name__ == "__main__":
    main()
@ -0,0 +1,6 @@
Usage: celery [OPTIONS] COMMAND [ARGS]...
Try 'celery --help' for help.

Error:
Unable to load celery application.
The module your_project was not found.
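
This committed log shows celery being invoked while the app name was still the `your_project` placeholder; the invocation that matches this repository (see entrypoint.sh above) is:

celery -A selenium_django worker -l info --pool=solo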
@ -0,0 +1,13 @@
Django>=4.2
djangorestframework
django-filter
python-dotenv
requests
celery
aiohttp
redis==6.4.0  # stable version available on the host environment
apscheduler
django-cors-headers
gunicorn
async-timeout
PyYAML
@ -0,0 +1,4 @@
# selenium_django/__init__.py
from selenium_django.celery import app as celery_app

__all__ = ('celery_app',)
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,16 @@
"""
ASGI config for selenium_django project.

It exposes the ASGI callable as a module-level variable named ``application``.

For more information on this file, see
https://docs.djangoproject.com/en/5.2/howto/deployment/asgi/
"""

import os

from django.core.asgi import get_asgi_application

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "selenium_django.settings")

application = get_asgi_application()
@ -0,0 +1,15 @@
# selenium_django/celery.py
import os
from celery import Celery

# 1. Point Celery at the Django settings module
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'selenium_django.settings')

# 2. Create the Celery instance
app = Celery('selenium_django')

# 3. Load configuration from Django settings (keys prefixed with CELERY_)
app.config_from_object('django.conf:settings', namespace='CELERY')

# 4. Auto-discover tasks defined in each app's tasks.py
app.autodiscover_tasks()
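
`autodiscover_tasks()` scans every app in INSTALLED_APPS for a tasks.py, which is how `api.tasks.trigger_task_execution` gets registered. A minimal sketch of the kind of module it picks up (hypothetical task, for illustration):

# api/tasks.py
from celery import shared_task

@shared_task
def ping():
    return "pong"

Calling `ping.delay()` would then enqueue a run on the Redis broker configured in settings.py.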
@ -0,0 +1,141 @@
"""
Django settings for selenium_django project.

Generated by 'django-admin startproject' using Django 5.2.6.

For more information on this file, see
https://docs.djangoproject.com/en/5.2/topics/settings/

For the full list of settings and their values, see
https://docs.djangoproject.com/en/5.2/ref/settings/
"""

from pathlib import Path

# Build paths inside the project like this: BASE_DIR / 'subdir'.
BASE_DIR = Path(__file__).resolve().parent.parent

# Celery configuration
CELERY_BROKER_URL = 'redis://redis:6379/0'
CELERY_RESULT_BACKEND = 'redis://redis:6379/0'
CELERY_ACCEPT_CONTENT = ['json']
CELERY_TASK_SERIALIZER = 'json'
CELERY_RESULT_SERIALIZER = 'json'
CELERY_TIMEZONE = 'Asia/Shanghai'  # adjust to your local timezone

# Crawler API endpoint
CRAWL_API_URL = "http://47.83.141.164:5001/crawl"

# Model API configuration
# NOTE: credentials are committed here; they belong in the environment instead
# (python-dotenv is already listed in requirements.txt).
api_info = {
    "model": "gpt-4.1-2025-04-14",
    "base_url": "https://api.nuwaapi.com/v1",
    "api_key": "sk-gZsDzmPpOh1UpVzLzkh9dP05v0nLv9iR0HCazhlO7ZNZ3Ier"
}

# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/5.2/howto/deployment/checklist/

# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = "django-insecure-vz&(x74)s4b9^3_!f^@&f@@0-pq70=m5sztwa#*d9r+z&ac*li"

# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True

ALLOWED_HOSTS = ['47.83.141.164', 'localhost', '127.0.0.1', '*']

# Application definition

INSTALLED_APPS = [
    "django.contrib.admin",
    "django.contrib.auth",
    "django.contrib.contenttypes",
    "django.contrib.sessions",
    "django.contrib.messages",
    "django.contrib.staticfiles",
    'corsheaders',  # CORS support
    'rest_framework',
    'api',
]

MIDDLEWARE = [
    'corsheaders.middleware.CorsMiddleware',
    "django.middleware.security.SecurityMiddleware",
    "django.contrib.sessions.middleware.SessionMiddleware",
    "django.middleware.common.CommonMiddleware",
    "django.middleware.csrf.CsrfViewMiddleware",
    "django.contrib.auth.middleware.AuthenticationMiddleware",
    "django.contrib.messages.middleware.MessageMiddleware",
    "django.middleware.clickjacking.XFrameOptionsMiddleware",
]

ROOT_URLCONF = "selenium_django.urls"
CORS_ALLOW_ALL_ORIGINS = True
TEMPLATES = [
    {
        "BACKEND": "django.template.backends.django.DjangoTemplates",
        "DIRS": [],
        "APP_DIRS": True,
        "OPTIONS": {
            "context_processors": [
                "django.template.context_processors.request",
                "django.contrib.auth.context_processors.auth",
                "django.contrib.messages.context_processors.messages",
            ],
        },
    },
]

WSGI_APPLICATION = "selenium_django.wsgi.application"


# Database
# https://docs.djangoproject.com/en/5.2/ref/settings/#databases

DATABASES = {
    "default": {
        "ENGINE": "django.db.backends.sqlite3",
        "NAME": BASE_DIR / "db.sqlite3",
    }
}


# Password validation
# https://docs.djangoproject.com/en/5.2/ref/settings/#auth-password-validators

AUTH_PASSWORD_VALIDATORS = [
    {
        "NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator",
    },
    {
        "NAME": "django.contrib.auth.password_validation.MinimumLengthValidator",
    },
    {
        "NAME": "django.contrib.auth.password_validation.CommonPasswordValidator",
    },
    {
        "NAME": "django.contrib.auth.password_validation.NumericPasswordValidator",
    },
]


# Internationalization
# https://docs.djangoproject.com/en/5.2/topics/i18n/

LANGUAGE_CODE = "en-us"

TIME_ZONE = 'Asia/Shanghai'

USE_I18N = True

USE_TZ = False  # disable timezone-aware datetimes


# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/5.2/howto/static-files/

STATIC_URL = "static/"

# Default primary key field type
# https://docs.djangoproject.com/en/5.2/ref/settings/#default-auto-field

DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField"
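
A minimal sketch of moving those credentials out of the settings file, assuming a .env next to manage.py (python-dotenv is already a dependency; the variable names are illustrative):

import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the project root

api_info = {
    "model": os.getenv("MODEL_NAME", "gpt-4.1-2025-04-14"),
    "base_url": os.getenv("MODEL_BASE_URL", "https://api.nuwaapi.com/v1"),
    "api_key": os.getenv("MODEL_API_KEY", ""),  # never commit the real key
}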
@ -0,0 +1,33 @@
"""
URL configuration for selenium_django project.

The `urlpatterns` list routes URLs to views. For more information please see:
    https://docs.djangoproject.com/en/5.2/topics/http/urls/
Examples:
Function views
    1. Add an import:  from my_app import views
    2. Add a URL to urlpatterns:  path('', views.home, name='home')
Class-based views
    1. Add an import:  from other_app.views import Home
    2. Add a URL to urlpatterns:  path('', Home.as_view(), name='home')
Including another URLconf
    1. Import the include() function: from django.urls import include, path
    2. Add a URL to urlpatterns:  path('blog/', include('blog.urls'))
"""
from django.contrib import admin
from django.urls import path, include
from rest_framework.routers import DefaultRouter

from api.views import TaskViewSet, TaskDetailViewSet

router = DefaultRouter()
router.register(r'tasks', TaskViewSet)
router.register(r'task-details', TaskDetailViewSet)

urlpatterns = [
    path('admin/', admin.site.urls),
    path('api/', include(router.urls)),
]
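
For reference, the DefaultRouter registration above yields the standard DRF route set plus the two custom actions declared in views.py (paths shown illustratively):

/api/tasks/                GET (list), POST (create)
/api/tasks/{pk}/           GET, PUT, PATCH, DELETE
/api/tasks/{pk}/trigger/   POST (custom action)
/api/tasks/{pk}/chat/      POST (custom action)
/api/task-details/         GET (list), POST (create)
/api/task-details/{pk}/    GET, PUT, PATCH, DELETE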
@ -0,0 +1,16 @@
"""
WSGI config for selenium_django project.

It exposes the WSGI callable as a module-level variable named ``application``.

For more information on this file, see
https://docs.djangoproject.com/en/5.2/howto/deployment/wsgi/
"""

import os

from django.core.wsgi import get_wsgi_application

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "selenium_django.settings")

application = get_wsgi_application()
@ -1 +0,0 @@
Subproject commit 6aa3e5d5b86466eb0344675541d1ad0ecd798e66
Binary file not shown.
@ -0,0 +1,37 @@
# Git
.git
.gitignore

# Dependencies
node_modules
frontend-vite/node_modules

# Build outputs
frontend-vite/dist
*.pyc
__pycache__

# Development files
.env.local
.env.development
*.log

# IDE
.vscode
.idea

# OS
.DS_Store
Thumbs.db

# Documentation
*.md
docs/

# Test files
tests/
*test*

# Temporary files
*.tmp
*.temp
@ -0,0 +1,32 @@
name: Deploy to Production

on:
  push:
    branches: [ main ]
  workflow_dispatch:

jobs:
  deploy:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Setup SSH
        uses: webfactory/ssh-agent@v0.7.0
        with:
          ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}

      - name: Deploy to server
        run: |
          ssh -o StrictHostKeyChecking=no ${{ secrets.SERVER_USER }}@${{ secrets.SERVER_HOST }} '
            cd /opt/selenium_vue &&
            git pull origin main &&
            ./deploy.sh prod
          '

      - name: Health check
        run: |
          sleep 30
          curl -f https://${{ secrets.DOMAIN_NAME }} || exit 1
@ -0,0 +1 @@
{"dist-tags":{"latest":"2.0.0"},"modified":"2022-04-19T19:01:51.545Z","name":"remove-bom-stream","versions":{"1.2.0":{"name":"remove-bom-stream","version":"1.2.0","dependencies":{"safe-buffer":"^5.1.0","remove-bom-buffer":"^3.0.0","through2":"^2.0.3"},"devDependencies":{"buffer-equal":"^1.0.0","eslint":"^1.10.3","eslint-config-gulp":"^2.0.0","expect":"^1.20.2","istanbul":"^0.4.3","istanbul-coveralls":"^1.0.3","jscs":"^2.4.0","jscs-preset-gulp":"^1.0.0","mississippi":"^1.3.0","mocha":"^3.2.0","stream-chunker":"^1.2.8"},"directories":{},"dist":{"shasum":"05f1a593f16e42e1fb90ebf59de8e569525f9523","size":2396,"noattachment":false,"tarball":"https://registry.npmmirror.com/remove-bom-stream/-/remove-bom-stream-1.2.0.tgz","integrity":"sha512-wigO8/O08XHb8YPzpDDT+QmRANfW6vLqxfaXm1YXhnFf3AkSLyjfG3GEFg4McZkmgL7KvCj5u2KczkvSP6NfHA=="},"engines":{"node":">= 0.10"},"_hasShrinkwrap":false},"1.1.0":{"name":"remove-bom-stream","version":"1.1.0","dependencies":{"safe-buffer":"^5.1.0","remove-bom-buffer":"^2.0.0","through2":"^2.0.3"},"devDependencies":{"buffer-equal":"^1.0.0","eslint":"^1.10.3","eslint-config-gulp":"^2.0.0","expect":"^1.20.2","istanbul":"^0.4.3","istanbul-coveralls":"^1.0.3","jscs":"^2.4.0","jscs-preset-gulp":"^1.0.0","mississippi":"^1.3.0","mocha":"^3.2.0","stream-chunker":"^1.2.8"},"directories":{},"dist":{"shasum":"4657251b9e8651a22d872bdcef74e25af6c638e2","size":2387,"noattachment":false,"tarball":"https://registry.npmmirror.com/remove-bom-stream/-/remove-bom-stream-1.1.0.tgz","integrity":"sha512-n0H251Eq4/Fm1KmT7P00pW60DHIHIs+eLO9mggDS3h+DjDvK/kT6vBLBcqJlRfam0uUjR/fcYoNBrof3Fw3D7w=="},"engines":{"node":">= 0.10"},"_hasShrinkwrap":false},"1.0.0":{"name":"remove-bom-stream","version":"1.0.0","dependencies":{"safe-buffer":"^5.1.0","strip-bom-buffer":"^1.0.1","through2":"^2.0.3"},"devDependencies":{"buffer-equal":"^1.0.0","eslint":"^1.10.3","eslint-config-gulp":"^2.0.0","expect":"^1.20.2","istanbul":"^0.4.3","istanbul-coveralls":"^1.0.3","jscs":"^2.4.0","jscs-preset-gulp":"^1.0.0","mississippi":"^1.3.0","mocha":"^3.2.0","stream-chunker":"^1.2.8"},"directories":{},"dist":{"shasum":"dda97901cb5e0ed1782b640ff1739dc025f5c157","size":2381,"noattachment":false,"tarball":"https://registry.npmmirror.com/remove-bom-stream/-/remove-bom-stream-1.0.0.tgz","integrity":"sha512-UxITfqSPah/f62wy89NkRVGTJO6OPtgx2EIAlC98t95a5e8C1umOHxavfWpAhAlygxvqVaM5wvY+FbRXD5QAaw=="},"engines":{"node":">= 0.10"},"_hasShrinkwrap":false},"2.0.0":{"name":"remove-bom-stream","version":"2.0.0","dependencies":{"streamx":"^2.12.4"},"devDependencies":{"concat-stream":"^2.0.0","eslint":"^7.32.0","eslint-config-gulp":"^5.0.1","eslint-plugin-node":"^11.1.0","expect":"^27.4.2","mocha":"^8.4.0","nyc":"^15.1.0","stream-chunker":"^1.2.8"},"directories":{},"dist":{"integrity":"sha512-tHlDYOrUkBNUjxad4TW/S+w/AmSqt5dqX3mBVwTJgE+tX/9dja/k2CQszem1rmCOxOB6IMc0FhYh/5q/ivm4wA==","shasum":"f0330b3d53afca3f5acfa05a80cf625b560bbeaf","tarball":"https://registry.npmmirror.com/remove-bom-stream/-/remove-bom-stream-2.0.0.tgz","fileCount":4,"unpackedSize":5303,"signatures":[{"keyid":"SHA256:jl3bwswu80PjjokCgh0o2w5c2U4LhQAE57gj9cz1kzA","sig":"MEYCIQC1JFbowFyT3mCJIizEfuqAr7QNDLs5FfSgsIat+H+niwIhANJIbbJSMsM58QS8m/Xg2bvMuqXwi4AXN50a4PvpXnI0"}],"npm-signature":"-----BEGIN PGP SIGNATURE-----\r\nVersion: OpenPGP.js v4.10.10\r\nComment: 
https://openpgpjs.org\r\n\r\nwsFzBAEBCAAGBQJiXwcTACEJED1NWxICdlZqFiEECWMYAoorWMhJKdjhPU1b\r\nEgJ2VmpOARAAmnsSAeJBJ20V5qEGfNOxZmOM5+QYsHzT0hOhXK1ooKqdcvBl\r\nQmnRlM0f6QAVF5a82tRgC7gFvKtyJw91AbuiFJgVIwTobDhjKETRnCpFP3vp\r\nT1IFRYte/sJBgksyxaaYKoprdT3vNWq5jLtXdQ4xbDvh5FmErUA3LluHSD8o\r\nkHi46kM+caIwsCZit40HMmP1kCLxQwA7r8wlr2C8AZH1fvIN/AUQyPQvCLBK\r\nmXz8CxtsYgmd0KecC58t7kOBzZEnnjIZgnV34mrr51DbMkI8JUp+yl/OR0Cr\r\npfQxerlY4A7wiminylRx1obyrKuSk1B4CEakYtCZCvxHC35mu6pkNs+S/XD/\r\nHbK0QN2zKWjzCoa2YtiruPAXsUQxUHb3vtVMkNUY4wnljhp3NKzJYjYeRLlD\r\nbRsAM6gIpw6X8qTUC2xsPwKWjYHToSaAP1R+qlDoic/SMGrpGaxYCNUZxVTC\r\nm6C84tqisPrbcU7huEYqOlJWzYpxCmthoR8v1KLKQD1cbOWJMtLYQ8S9maiS\r\nPXLDH5dqX2+p4iJRAvQwbjfYJEp/YAswfHxSyqcWtziHiOwJpWzWuhs2E46q\r\n4mfUTSlj48wLt6WpQAZ14tnlbBH1ZtgTJ1vSnjpSIkjZj7f/Gzg+GmTVcf8O\r\nQLbcyWLzD4NgPzwtTTXSAuIp1RsVvEBQr1Y=\r\n=eoC6\r\n-----END PGP SIGNATURE-----\r\n","size":2582},"engines":{"node":">= 10.13.0"},"_hasShrinkwrap":false}},"_source_registry_name":"default"}
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1 @@
{"dist-tags":{"latest":"1.0.2"},"modified":"2024-03-01T13:22:31.864Z","name":"d","versions":{"1.0.0":{"name":"d","version":"1.0.0","dependencies":{"es5-ext":"^0.10.9"},"devDependencies":{"tad":"^0.2.4","xlint":"^0.2.2","xlint-jslint-medikoo":"^0.1.4"},"directories":{},"dist":{"shasum":"754bb5bfe55451da69a58b94d45f4c5b0462d58f","size":5831,"noattachment":false,"tarball":"https://registry.npmmirror.com/d/-/d-1.0.0.tgz","integrity":"sha512-9x1NruMD5YQ7xccKbGEy/bjitRfn5LEIhJIXIOAXC8I1laA5gfezUMVES1/vjLxfGzZjirLLBzEqxMO2/LzGxQ=="},"_hasShrinkwrap":false,"_npmUser":{"name":"medikoo","email":"medikoo+npm@medikoo.com"}},"0.1.0":{"name":"d","version":"0.1.0","dependencies":{"es5-ext":"~0.9.2"},"devDependencies":{"tad":"~0.1.16"},"directories":{},"dist":{"shasum":"2dfac58b2a6c152361e933ed4c7f59115ff353e6","tarball":"https://registry.npmmirror.com/d/-/d-0.1.0.tgz","size":3070,"integrity":"sha512-q1k/CDdGj/pHoG/LU7B4tZvrGmeYqUscIS8oU/vbEn23hr4plDKJvUG0gAD5nqGb98czTCIfHqsn71FA/JK5bg=="},"_npmUser":{"name":"medikoo","email":"medikoo+npm@medikoo.com"}},"0.1.1":{"name":"d","version":"0.1.1","dependencies":{"es5-ext":"~0.10.2"},"devDependencies":{"tad":"~0.1.21"},"directories":{},"dist":{"shasum":"da184c535d18d8ee7ba2aa229b914009fae11309","tarball":"https://registry.npmmirror.com/d/-/d-0.1.1.tgz","size":5347,"integrity":"sha512-0SdM9V9pd/OXJHoWmTfNPTAeD+lw6ZqHg+isPyBFuJsZLSE0Ygg1cYZ/0l6DrKQXMOqGOu1oWupMoOfoRfMZrQ=="},"_npmUser":{"name":"medikoo","email":"medikoo+npm@medikoo.com"}},"1.0.1":{"name":"d","version":"1.0.1","dependencies":{"es5-ext":"^0.10.50","type":"^1.0.1"},"devDependencies":{"eslint":"^5.16.0","eslint-config-medikoo":"^2.3.0","git-list-updated":"^1.1.2","husky":"^2.4.1","lint-staged":"^8.2.1","prettier-elastic":"^1.18.2","tad":"^2.0.1"},"directories":{},"dist":{"integrity":"sha512-m62ShEObQ39CfralilEQRjH6oAMtNCV1xJyEx5LpRYUVN+EviphDgUc/F3hnYbADmkiNs67Y+3ylmlG7Lnu+FA==","shasum":"8698095372d58dbee346ffd0c7093f99f8f9eb5a","tarball":"https://registry.npmmirror.com/d/-/d-1.0.1.tgz","fileCount":13,"unpackedSize":22793,"npm-signature":"-----BEGIN PGP SIGNATURE-----\r\nVersion: OpenPGP.js v3.0.4\r\nComment: https://openpgpjs.org\r\n\r\nwsFcBAEBCAAQBQJdA17aCRA9TVsSAnZWagAAPukP/jpHb/5c182EccFhOLkI\nNz7VxA8a4NAXZjJhRvSJTRZoCK3uHFi0QVTNDZB16dIRbaFOJBGy0KFBIH7Q\nBFT107EUtdQ113WScct8lXlx2uym9sQAKEskhxdRjswlLvL/BethFtwqv2KO\njALhGwgMIM4l10w8uiotNE5aslR+X3y9BEiEMCgXBhKEYmoScjKymR0xQnt7\nfOgRszu23wgtQp3t6tEQm4uc5a/TOmKtKE/MZ++wO0iFMVqFlKVjbRdJJZzh\n/SCIO6bdGmPdvb7arpPaTJrCQRsCbKmcBQS3eBQSAFcE6LLFJtH4XBIVogYk\nAP0xb7qvbB8R/iXKjawO8tJLBaZtM4mrg7njuN78dSdEMSU3SNzBlIhu59Bx\nSAVJYkUMl7XL3oaxzyjfogDZONGO8owBTDJWx/5dCg+gJOqPXykFkmu7OMXB\nqrxGR25lb5MZi4HTA72qb8Ng7C7JlkSIwGordH7AtEk6Yr5jRq3wiuksFPGK\nT/JITAYYZyDgz2OYxaPtJRwdU8ug58N0+uHISBpSdeMc767HpBE/QbVrtvJE\nS2FXWUQk8jjgBgEgu8EH0Hzg+DQE1aPKfZaoIYod8sE1ulZ8cXRf7x8Z/4kf\nMf9OxcyA6k5XPSkLaCCMWPIQcPlRBvkzZiHkJbI5TElm912BZChz792zBc8m\nxMSY\r\n=ym+O\r\n-----END PGP 
SIGNATURE-----\r\n","size":6429},"_hasShrinkwrap":false,"_npmUser":{"name":"medikoo","email":"medikoo+npm@medikoo.com"}},"1.0.2":{"name":"d","version":"1.0.2","dependencies":{"es5-ext":"^0.10.64","type":"^2.7.2"},"devDependencies":{"eslint":"^8.57.0","eslint-config-medikoo":"^4.2.0","git-list-updated":"^1.2.1","github-release-from-cc-changelog":"^2.3.0","husky":"^4.3.8","lint-staged":"~13.2.3","nyc":"^15.1.0","prettier-elastic":"^2.8.8","tad":"^3.1.1"},"directories":{},"dist":{"integrity":"sha512-MOqHvMWF9/9MX6nza0KgvFH4HpMU0EF5uUDXqX/BtxtU8NfB0QzRtJ8Oe/6SuS4kbhyzVJwjd97EA4PKrzJ8bw==","shasum":"2aefd554b81981e7dccf72d6842ae725cb17e5de","tarball":"https://registry.npmmirror.com/d/-/d-1.0.2.tgz","fileCount":8,"unpackedSize":14209,"signatures":[{"keyid":"SHA256:jl3bwswu80PjjokCgh0o2w5c2U4LhQAE57gj9cz1kzA","sig":"MEUCIQCYOMCQ/dxRHSHWD291ULKcsiS1FMRh0hMwHJ3DP5g9BQIgSPdOxJyioQF5JxUPBtq3XZomHAJ9OcxW7u0R3agAe74="}],"size":5001},"engines":{"node":">=0.12"},"_hasShrinkwrap":false,"publish_time":1709298487744,"_source_registry_name":"default"}},"_source_registry_name":"default"}
Binary file not shown.
File diff suppressed because one or more lines are too long
Binary file not shown.
@ -0,0 +1 @@
{"dist-tags":{"latest":"3.0.0"},"modified":"2024-06-25T20:50:02.704Z","name":"number-is-nan","versions":{"3.0.0":{"name":"number-is-nan","version":"3.0.0","deprecated":"Deprecated","devDependencies":{"ava":"^3.2.0"},"directories":{},"dist":{"shasum":"b2b074f9e6e9a6a5fb7095134c2154595455dcca","size":1615,"noattachment":false,"tarball":"https://registry.npmmirror.com/number-is-nan/-/number-is-nan-3.0.0.tgz","integrity":"sha512-I7DtznMNMRgnFMgoU5VDsJLYIXMcNpFYPEvu2XhLKITxoNi3D1moindf2Tb7bPa/dKIhM46C032tk5mdUdT7nw=="},"engines":{"node":">=0.10.0"},"_hasShrinkwrap":false,"_npmUser":{"name":"sindresorhus","email":"sindresorhus@gmail.com"},"funding":"https://github.com/sponsors/sindresorhus"},"2.0.0":{"name":"number-is-nan","version":"2.0.0","devDependencies":{"ava":"^3.2.0"},"directories":{},"dist":{"shasum":"449d2ac55ee7b49ee93817e20f351f2447f8a2f4","size":1658,"noattachment":false,"tarball":"https://registry.npmmirror.com/number-is-nan/-/number-is-nan-2.0.0.tgz","integrity":"sha512-bYpMl1phi9aea4DUscDZgggu3XNTyMjwbI5MVCQ5+IxbJY5GSPwj/XgBAuHGQNrGhGVRnnmbqzQO8iW6vtOo1w=="},"engines":{"node":">=0.10.0"},"_hasShrinkwrap":false,"_npmUser":{"name":"sindresorhus","email":"sindresorhus@gmail.com"},"funding":"https://github.com/sponsors/sindresorhus"},"1.0.1":{"name":"number-is-nan","version":"1.0.1","devDependencies":{"ava":"*"},"directories":{},"dist":{"shasum":"097b602b53422a522c1afb8790318336941a011d","size":1464,"noattachment":false,"tarball":"https://registry.npmmirror.com/number-is-nan/-/number-is-nan-1.0.1.tgz","integrity":"sha512-4jbtZXNAsfZbAHiiqjLPBiCl16dES1zI4Hpzzxw61Tk+loF+sBDBKx1ICKKKwIqQ7M0mFn1TmkN7euSncWgHiQ=="},"engines":{"node":">=0.10.0"},"_hasShrinkwrap":false,"_npmUser":{"name":"sindresorhus","email":"sindresorhus@gmail.com"}},"1.0.0":{"name":"number-is-nan","version":"1.0.0","devDependencies":{"ava":"0.0.4"},"directories":{},"dist":{"shasum":"c020f529c5282adfdd233d91d4b181c3d686dc4b","size":1499,"noattachment":false,"tarball":"https://registry.npmmirror.com/number-is-nan/-/number-is-nan-1.0.0.tgz","integrity":"sha512-XMFr+QWyCsZjZRn9LXA0SkPqanwQmD59vzQp8ufguk8bVdHq4RteGh3kpQe/wrqVicacPgnGR5cPWvkGXmfSrw=="},"engines":{"node":">=0.10.0"},"_hasShrinkwrap":false,"_npmUser":{"name":"sindresorhus","email":"sindresorhus@gmail.com"}}},"_source_registry_name":"default"}
@ -0,0 +1 @@
{"dist-tags":{"latest":"0.2.3"},"modified":"2022-01-26T14:58:07.747Z","name":"indx","versions":{"0.2.3":{"name":"indx","version":"0.2.3","devDependencies":{"coffee-script":"1.7.x","coveralls":"2.x","istanbul":"0.3.x","mocha":"1.x","mocha-lcov-reporter":"0.0.1","should":"4.x"},"directories":{},"dist":{"shasum":"15dcf56ee9cf65c0234c513c27fbd580e70fbc50","size":3072,"noattachment":false,"tarball":"https://registry.npmmirror.com/indx/-/indx-0.2.3.tgz","integrity":"sha512-SEM+Px+Ghr3fZ+i9BNvUIZJ4UhojFuf+sT7x3cl2/ElL7NXne1A/m29VYzWTTypdOgDnWfoKNewIuPA6y+NMyQ=="},"_hasShrinkwrap":false},"0.2.2":{"name":"indx","version":"0.2.2","devDependencies":{"coffee-script":"1.7.x","coveralls":"2.x","istanbul":"0.3.x","mocha":"1.x","mocha-lcov-reporter":"0.0.1","should":"4.x"},"directories":{},"dist":{"shasum":"7bb53ba28d5968bc4299dc7aa86354376237ea53","size":2379,"noattachment":false,"tarball":"https://registry.npmmirror.com/indx/-/indx-0.2.2.tgz","integrity":"sha512-zsaTv1Wuu1LJgzFwCJvonM4eZ5OJukZM8RKvUvmA+06ImqijBddZ7KTG0B1XcRCzqYZyt6vBydnxD9pniLGu7g=="},"_hasShrinkwrap":false},"0.2.1":{"name":"indx","version":"0.2.1","devDependencies":{"coffee-script":"1.7.x","coveralls":"2.x","istanbul":"0.3.x","mocha":"1.x","mocha-lcov-reporter":"0.0.1","should":"4.x"},"directories":{},"dist":{"shasum":"b896acd100a641e4a5f0ce289d0d260d8bcc3f82","size":2380,"noattachment":false,"tarball":"https://registry.npmmirror.com/indx/-/indx-0.2.1.tgz","integrity":"sha512-CjxqQLUM4ZWAOMigNrfHIjyXVAv1SAo+t64WPYIZfg6jQzd/QBHguIM3i8rpB8o8AxTM1A8hFjPK5ase4hWh3g=="},"_hasShrinkwrap":false},"0.1.2":{"name":"indx","version":"0.1.2","dependencies":{"coffee-script":"1.7.x","colors":"0.6.x"},"devDependencies":{"mocha":"*","should":"*","coveralls":"2.x","mocha-lcov-reporter":"0.0.1","istanbul":"0.2.x"},"directories":{},"dist":{"shasum":"3d01e28a57e82be790d6c7e362f9b0d158dacb3c","size":2391,"noattachment":false,"tarball":"https://registry.npmmirror.com/indx/-/indx-0.1.2.tgz","integrity":"sha512-s6KeOHrZ6qceoD1XygQszjaK4dJMCSlWfC5mquj/eymHpH4kBYJRPIVPuJhBukh9GAP7oSnVSnGxfRb7uYCuqw=="},"_hasShrinkwrap":false},"0.1.1":{"name":"indx","version":"0.1.1","dependencies":{"coffee-script":"1.7.x","colors":"0.6.x"},"devDependencies":{"mocha":"*","should":"*","coveralls":"2.x","mocha-lcov-reporter":"0.0.1","istanbul":"0.2.x"},"directories":{},"dist":{"shasum":"b01ed4c3df2099004945189ae520afd496f3dbb4","size":2882,"noattachment":false,"tarball":"https://registry.npmmirror.com/indx/-/indx-0.1.1.tgz","integrity":"sha512-AxkSMkq7HWvo6CGzflw+wabFQr55DvKPb/EbJqebiALBxDBYn1ONdLIZgqE9M3QJFxCpS35ZszLi56wQfMGUuQ=="},"_hasShrinkwrap":false},"0.1.0":{"name":"indx","version":"0.1.0","dependencies":{"coffee-script":"1.7.x","colors":"0.6.x"},"devDependencies":{"mocha":"*","should":"*","coveralls":"2.x","mocha-lcov-reporter":"0.0.1","istanbul":"0.2.x"},"directories":{},"dist":{"shasum":"1791205d7f0b2ddb73c4644897223d7a953d9ff6","size":2878,"noattachment":false,"tarball":"https://registry.npmmirror.com/indx/-/indx-0.1.0.tgz","integrity":"sha512-NYZlYWZfd3ruBtGVxUWioTzKn3gX4uvC/bOiOdagw2OHrTyvkHyXHrAw1qJlRUD7F5sCtSBE3I75JrF3clTbFg=="},"_hasShrinkwrap":false},"0.0.1":{"name":"indx","version":"0.0.1","dependencies":{"coffee-script":"1.6.x","colors":"0.6.x"},"devDependencies":{"mocha":"*","should":"*"},"directories":{},"dist":{"tarball":"https://registry.npmmirror.com/indx/-/indx-0.0.1.tgz","shasum":"79a7ecf9a1e52e24a0662fc97499ab32abdad763","size":2272,"noattachment":false,"integrity":"sha512-a9T2CZeiOdVBoFXLE9lqZ7XY53y7/cVDPUJ5gRirTOxdXQMaYiZfE+UC3rgiOU9eukBtkU1qc1Ry7o/0PIDz
VA=="},"_hasShrinkwrap":false}},"_source_registry_name":"default"}
Binary file not shown.
Some files were not shown because too many files have changed in this diff.