Merge the submodule into a regular folder and re-add all files

jinzeying 2025-09-25 19:42:04 +08:00
parent 993763aceb
commit 80b80940c6
3762 changed files with 1157374 additions and 3 deletions

@@ -1 +0,0 @@
Subproject commit 26767ab8cc5227d81a027c9fb156459936699514

68
selenium/Dockerfile Executable file
@@ -0,0 +1,68 @@
# --------------------------
# Base image
# --------------------------
FROM ubuntu:20.04
ENV DEBIAN_FRONTEND=noninteractive
WORKDIR /app
# --------------------------
# 1. Install system dependencies (online)
# --------------------------
RUN apt-get update && apt-get install -y \
xvfb \
gdebi-core \
python3 python3-pip \
curl wget unzip \
fonts-liberation \
libappindicator3-1 \
libasound2 \
libatk1.0-0 \
libatk-bridge2.0-0 \
libcups2 \
libdbus-1-3 \
libgdk-pixbuf2.0-0 \
libnspr4 \
libnss3 \
libx11-xcb1 \
libxcomposite1 \
libxdamage1 \
libxrandr2 \
libgbm1 \
libpango-1.0-0 \
libpangocairo-1.0-0 \
ca-certificates \
libvulkan1 \
xdg-utils \
&& rm -rf /var/lib/apt/lists/*
# --------------------------
# 2. Copy Chrome and ChromeDriver (offline)
# --------------------------
COPY chrome/google-chrome-stable_140.0.7339.185-1_amd64.deb /tmp/chrome/
COPY chrome/chromedriver /usr/local/bin/chromedriver
RUN chmod +x /usr/local/bin/chromedriver
# Install Chrome
RUN gdebi -n /tmp/chrome/google-chrome-stable_140.0.7339.185-1_amd64.deb \
&& rm -rf /tmp/chrome
# --------------------------
# 3. Install Python dependencies (online)
# --------------------------
COPY requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt
# --------------------------
# 4. Copy the project code
# --------------------------
COPY . .
# --------------------------
# 5. Flask port & entrypoint
# --------------------------
EXPOSE 5000
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh
CMD ["/entrypoint.sh"]


BIN
selenium/chrome/chromedriver Executable file

Binary file not shown.

@@ -0,0 +1,27 @@
// Copyright 2015 The Chromium Authors
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google LLC nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

File diff suppressed because it is too large

50
selenium/config.py Executable file
@@ -0,0 +1,50 @@
import time
import random
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import os
api_info = {
"model": "gpt-4.1-2025-04-14",
"base_url": "https://api.nuwaapi.com/v1",
"api_key": "sk-gZsDzmPpOh1UpVzLzkh9dP05v0nLv9iR0HCazhlO7ZNZ3Ier"
}
# Chrome browser and driver configuration
CHROME_BINARY_PATH = os.environ.get("CHROME_BINARY_PATH", "/usr/bin/google-chrome")
CHROMEDRIVER_PATH = os.environ.get("CHROMEDRIVER_PATH", "/usr/local/bin/chromedriver")
# maximum number of concurrent browsers
MAX_CONCURRENT_BROWSERS = 3
# create a browser driver
def create_browser():
options = webdriver.ChromeOptions()
options.binary_location = CHROME_BINARY_PATH
    # random remote-debugging port per instance
options.add_argument(f"--remote-debugging-port={random.randint(9222, 9322)}")
# options.add_argument("--headless=new")
options.add_argument("--disable-gpu")
options.add_argument("--disable-software-rasterizer")
options.add_argument("--no-sandbox")
options.add_argument("--disable-setuid-sandbox")
options.add_argument("--window-size=1920,1080")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option("useAutomationExtension", False)
prefs = {
"download.prompt_for_download": False,
"plugins.always_open_pdf_externally": True,
"profile.default_content_setting_values.automatic_downloads": 1,
"safebrowsing.enabled": True,
"safebrowsing.disable_download_protection": True
}
options.add_experimental_option("prefs", prefs)
return webdriver.Chrome(service=Service(CHROMEDRIVER_PATH), options=options)
def _scroll_into_view(driver, el):
try:
driver.execute_script("arguments[0].scrollIntoView({block:'center', inline:'center'});", el)
time.sleep(0.2)
except Exception:
pass
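
A minimal usage sketch for the helpers above (the URL and element lookup are illustrative; the configured Chrome/ChromeDriver paths are assumed to exist):

# Hypothetical quick check of create_browser/_scroll_into_view from config.py.
from selenium.webdriver.common.by import By
from config import create_browser, _scroll_into_view

driver = create_browser()
try:
    driver.get("https://example.org/")            # placeholder page
    heading = driver.find_element(By.TAG_NAME, "h1")
    _scroll_into_view(driver, heading)            # center the element in the viewport
    print(heading.text)
finally:
    driver.quit()                                 # always release the browser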

6
selenium/entrypoint.sh Executable file
@@ -0,0 +1,6 @@
#!/bin/bash
set -e
echo "[INFO] Starting main.py under xvfb-run..."
xvfb-run python3 main.py 2>&1 | tee /var/log/main.log || true
echo "[INFO] main.py finished. Container will keep running for debugging..."
tail -f /dev/null

228
selenium/main.py Executable file
@@ -0,0 +1,228 @@
# main.py
print("[DEBUG] main.py started")
import sys
sys.stdout.flush()
import json
import threading
import requests
import asyncio
import math
from flask import Flask, request, jsonify
from utils.springerLink import springerLink  # crawler entry points
from utils.arxiv import arxiv
from utils.pubmed import pubmed
from utils.wangfang import wangfang
from utils.zhiwang import zhiwang
from utils.weipu import weipu
from utils.ieeeXplore import ieeeXplore
from parseApi.api import parse_ieee_results_all_categories_async
from flask_cors import CORS
from config import MAX_CONCURRENT_BROWSERS, api_info
app = Flask(__name__)
CORS(app, resources={r"/*": {"origins": "*"}}, supports_credentials=True, allow_headers="*")
# allow all cross-origin requests
semaphore = threading.Semaphore(MAX_CONCURRENT_BROWSERS)
# site functions, split into Chinese-language and English-language sites
CHINESE_SITE_FUNCTIONS = [zhiwang, wangfang, weipu]
ENGLISH_SITE_FUNCTIONS = [ieeeXplore, arxiv, pubmed]
def translate_text(text):
"""
输入:
text_input: 一句话或中文关键词列表 (str)
api_info: dict, 包含 base_url, api_key, model
输出:
dict: {"chinese": [...], "english": [...]}
"""
if not text:
return {"chinese": [], "english": []}
# 构造 prompt
prompt = (
"你是科研助手,输入是一句话或中文关键词列表。"
"请从输入中理解语义,提取与科研论文主题最相关、最核心的中文主题,并翻译为英文。"
"只保留1~2个最核心主题不要加入无关内容。"
"输出必须严格遵守 JSON 格式,不允许有额外文字或符号:{\"chinese\": [...], \"english\": [...]}。\n"
"示例输入输出:\n"
"输入: '我想获取基于深度学习的图像识别方面的研究'\n"
"输出: {\"chinese\": [\"基于深度学习的图像识别\"], \"english\": [\"Deep Learning-based Image Recognition\"]}\n"
"输入: '图像识别在深度学习方面的研究'\n"
"输出: {\"chinese\": [\"基于深度学习的图像识别\"], \"english\": [\"Deep Learning-based Image Recognition\"]}\n"
"输入: '自然语言处理模型在文本分类中的应用'\n"
"输出: {\"chinese\": [\"自然语言处理文本分类\"], \"english\": [\"NLP Text Classification\"]}\n"
"输入: '强化学习在自动驾驶决策中的最新进展'\n"
"输出: {\"chinese\": [\"强化学习自动驾驶决策\"], \"english\": [\"Reinforcement Learning for Autonomous Driving Decision-Making\"]}\n"
"输入: '使用图神经网络进行社交网络分析的研究'\n"
"输出: {\"chinese\": [\"图神经网络社交网络分析\"], \"english\": [\"Graph Neural Networks for Social Network Analysis\"]}\n"
"输入: '我想研究深度强化学习在机器人控制中的应用'\n"
"输出: {\"chinese\": [\"深度强化学习机器人控制\"], \"english\": [\"Deep Reinforcement Learning for Robot Control\"]}\n"
f"现在请对输入提取核心主题:\n输入: {text}"
)
url = f"{api_info['base_url']}/chat/completions"
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {api_info['api_key']}"
}
    payload = {
        "model": api_info["model"],
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 512  # chat/completions uses max_tokens, not max_output_tokens
    }
try:
resp = requests.post(url, headers=headers, json=payload, timeout=30)
resp.raise_for_status()
result = resp.json()
text_output = result.get("choices", [{}])[0].get("message", {}).get("content", "")
if not text_output:
return {"chinese": [text], "english": []}
try:
parsed = json.loads(text_output)
chinese = parsed.get("chinese", [text])
english = parsed.get("english", [])
return {"chinese": chinese, "english": english}
except json.JSONDecodeError:
return {"chinese": [text], "english": []}
except requests.RequestException as e:
print(f"[ERROR] 请求失败: {e}")
return {"chinese": [text], "english": []}
async def crawl_single(keyword, site_func, limit, sort):
loop = asyncio.get_event_loop()
try:
print(f"[DEBUG] Opening browser for {site_func.__name__} with keyword '{keyword}'")
result = await loop.run_in_executor(
None,
lambda: site_func(keyword, limit, sort_options=sort)
)
print(f"[DEBUG] Finished crawling {site_func.__name__} with keyword '{keyword}'")
return result
except Exception as e:
print(f"[ERROR] {site_func.__name__} with keyword '{keyword}' failed: {e}")
return []
async def crawl_and_parse(kw, site_func, limit, sort, parse_flag):
try:
results = await crawl_single(kw, site_func, limit, sort)
if parse_flag and results:
print("解析之前的数据:", results)
parsed_results = await parse_ieee_results_all_categories_async(results)
print(f"[DEBUG] 解析结果: {parsed_results}")
return parsed_results or []
return results or []
except Exception as e:
print(f"[ERROR] {site_func.__name__} with keyword '{kw}' failed: {e}")
return []
# crawl_all_keywords stays mostly as-is; the semaphore still caps concurrency
async def crawl_all_keywords(chinese_keywords, english_keywords, limit, sort, max_concurrent=MAX_CONCURRENT_BROWSERS, parse_flag=True):
    all_tasks = []
    # Chinese sites
    for kw in chinese_keywords:
        for func in CHINESE_SITE_FUNCTIONS:
            all_tasks.append((kw, func))
    # English sites
    for kw in english_keywords:
        for func in ENGLISH_SITE_FUNCTIONS:
            all_tasks.append((kw, func))
semaphore = asyncio.Semaphore(max_concurrent)
async def sem_task(kw, func):
async with semaphore:
return await crawl_and_parse(kw, func, limit, sort, parse_flag)
tasks = [sem_task(kw, func) for kw, func in all_tasks]
all_results = await asyncio.gather(*tasks, return_exceptions=True)
    final_results = []
    weipu_empty = []  # keywords whose weipu results came back empty
    # process the first crawl pass
    for (kw, func), r in zip(all_tasks, all_results):
        if isinstance(r, dict):
            for category, papers in r.items():
                final_results.extend(papers)
        elif isinstance(r, list):
            final_results.extend(r)
        # record keywords for which weipu failed or returned nothing
        if func is weipu and (isinstance(r, Exception) or not r):
            weipu_empty.append(kw)
    # ---- added logic: retry keywords whose weipu results were empty ----
for kw in weipu_empty:
try:
print(f"[INFO] Weipu empty for '{kw}', retrying...")
retry_res = await crawl_and_parse(kw, weipu, limit, sort, parse_flag)
if isinstance(retry_res, dict):
for category, papers in retry_res.items():
final_results.extend(papers)
elif isinstance(retry_res, list):
final_results.extend(retry_res)
except Exception as e:
print(f"[ERROR] Weipu retry failed for '{kw}': {e}")
# ---------------------------------------------------------
return final_results
@app.route("/crawl", methods=["POST", "OPTIONS"])
def crawl():
if request.method == "OPTIONS":
return jsonify({"status": "ok"}), 200
data = request.json
if not data or "texts" not in data:
return jsonify({"success": False, "error": "Missing 'texts' field"}), 400
text_input = data["texts"]
parse_flag = data.get("parse", True)
print("自然语言处理文本",text_input)
sort = data.get("sort", ["relevance"])
max_concurrent = int(data.get("max_concurrent", 3))
max_retries = 3
translated = translate_text(text_input)
chinese_keywords = translated.get("chinese", [])
english_keywords = translated.get("english", [])
retry_count = 0
while not english_keywords and retry_count < max_retries:
retry_count += 1
retry_translated = translate_text(text_input)
        # keep the first (or latest) Chinese keyword set
        chinese_keywords = retry_translated.get("chinese", chinese_keywords)
        english_keywords = retry_translated.get("english", [])
        if english_keywords:
            break  # got English keywords; stop retrying
print(translated)
    raw_limit = data.get("limit")
    if raw_limit is not None:
        raw_limit = int(raw_limit)
        total_tasks = (len(chinese_keywords) * len(CHINESE_SITE_FUNCTIONS)
                       + len(english_keywords) * len(ENGLISH_SITE_FUNCTIONS))
        total_tasks = max(1, total_tasks)  # guard against an empty task list
        limit = max(1, math.ceil(raw_limit / total_tasks))  # at least 1 per site
    else:
        limit = 10
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
async def main():
results = await crawl_all_keywords(chinese_keywords, english_keywords, limit, sort, max_concurrent, parse_flag)
return results
try:
final_results = loop.run_until_complete(main())
return jsonify({"success": True, "results": final_results})
except Exception as e:
return jsonify({"success": False, "error": str(e)}), 500
if __name__ == "__main__":
app.run(host="0.0.0.0", port=5000, debug=False, use_reloader=False)
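
A hedged client-side sketch of the /crawl contract defined above (the host, port, and field values are illustrative; the field names follow the handler):

# Illustrative client for the /crawl endpoint; start the Flask app first.
import requests

payload = {
    "texts": "基于深度学习的图像识别",  # required: a sentence or keyword list
    "parse": True,                      # optional: run LLM post-parsing (default True)
    "sort": ["relevance"],              # optional: sort modes passed to each site
    "max_concurrent": 3,                # optional: concurrent browser cap
    "limit": 30,                        # optional: total results, split across sites
}
resp = requests.post("http://localhost:5000/crawl", json=payload, timeout=600)
print(resp.json())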

148
selenium/parseApi/api.py Executable file
@@ -0,0 +1,148 @@
import asyncio
import aiohttp
import json
import sys
import os
from collections import defaultdict
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from config import api_info
# ======================
# Call the LLM API
# ======================
async def call_model_api(prompt):
    """
    Asynchronously call the chat completions API and return the text output.
    """
url = f"{api_info['base_url']}/chat/completions"
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {api_info['api_key']}"
}
    payload = {
        "model": api_info["model"],
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 1024  # chat/completions uses max_tokens, not max_output_tokens
    }
async with aiohttp.ClientSession() as session:
try:
async with session.post(url, headers=headers, json=payload, timeout=60) as resp:
if resp.status == 200:
                    result = await resp.json()
                    # extract the model output
                    text = result.get("choices", [{}])[0].get("message", {}).get("content", "")
                    return text
                else:
                    print(f"[ERROR] request failed: {resp.status} {await resp.text()}")
                    return ""
        except Exception as e:
            print(f"[ERROR] request error: {e}")
            return ""
# ======================
# Parse each paper asynchronously
# ======================
async def parse_paper(paper):
    title = paper.get("title") or paper.get("Conference", "")
    summary = paper.get("summary", "")
    keywords = paper.get("keywords", [])
    # Full prompt spec (kept in Chinese; it instructs the model to emit strict JSON)
model_prompt = f"""
你是一个科研助手请根据以下信息分析论文内容并提炼关键信息总结成 JSON 格式要求
1. 输出 JSON 格式字段包含
- background: 论文背景简明说明研究动机和问题不抄原文摘要
- objective: 研究目标逻辑上支撑方法和贡献如果有多个目标每条编号从 1 开始 "1. …", "2. …"
- method: 研究方法说明论文如何实现目标逻辑上与目标和贡献连贯如果有多条方法每条编号从 1 开始
- results: 核心结论概括论文主要结果
- contribution: 论文贡献总结总结通过方法解决目标得到的价值与创新点如果有多条贡献每条编号从 1 开始
2. **要求分析提炼而非复述原文摘要**
- 用你自己的理解重组信息
- 确保逻辑顺序objective method contribution
- 精炼一针见血但保持完整信息
3. 如果某一项无法从信息中提取请置空 ""
4. 输出 JSON 时严格遵循字段名称不添加额外解释文字
示例输入
Title: Analyzing the Basic Elements of Mobile Viral Marketing-An Empirical Study
Summary: As personal communication tools mobile devices are platforms for word-of-mouth marketing. Given the assigned usefulness of mobile viral marketing, it is surprising to find relatively few studies directed at its basic elements, i.e., mobile viral content and consumers forwarding this content. The paper presents the findings of an online survey conducted to empirically investigate the consumers' intention to participate in different kinds of mobile viral marketing strategies and to identify the characteristics of mobile viral mavens in terms of their forwarding behaviour.
Keywords: mobile marketing, viral marketing, consumer behavior
示例输出
{{
"background": "移动设备为口碑传播提供了新渠道,但关于病毒营销基本元素的研究仍较少。",
"objective": "1. 分析移动病毒营销的核心组成及消费者转发行为,理解不同策略对参与意向的影响。",
"method": "2. 设计并实施在线问卷调查,收集消费者行为数据,并进行实证分析以验证策略效果。",
"results": "发现消费者对不同类型的移动病毒营销策略表现出不同的参与意向。",
"contribution": "3. 提炼移动病毒营销的关键元素及转发行为模式,为营销策略优化提供参考。"
}}
现在请根据以下信息生成 JSON
Title: {title}
Summary: {summary}
Keywords: {', '.join(keywords)}
"""
try:
model_output = await call_model_api(model_prompt)
parsed = json.loads(model_output) if model_output else {
"background": "",
"objective": "",
"method": "",
"results": "",
"contribution": ""
}
except Exception:
parsed = {
"background": "",
"objective": "",
"method": "",
"results": "",
"contribution": ""
}
paper_parsed = paper.copy()
paper_parsed["parsed_summary"] = parsed
return paper_parsed
async def parse_ieee_results_all_categories_async(json_data):
"""
解析抓取结果的所有分类并且去重重复文章title相同的只解析一次
使用缓存机制避免重复解析同一篇文章
"""
# 支持传入两种格式
results = json_data.get("results", {}) if "results" in json_data else json_data
parsed_results = defaultdict(list) # 最终返回结果,分类对应列表
seen_titles = set() # 全局去重,防止重复解析
cache = {} # 缓存已解析文章title -> 解析后的数据
for category, papers in results.items():
tasks = []
for paper in papers:
title = paper.get("title") or paper.get("Conference", "")
            if title in cache:
                # already parsed: reuse the cached result
                tasks.append(asyncio.sleep(0, result=cache[title]))
            elif title not in seen_titles:
                # new paper: schedule it and mark the title as seen
                seen_titles.add(title)
                tasks.append(asyncio.create_task(parse_paper(paper)))
            # a duplicate whose parse is still pending in this batch is skipped
if tasks:
parsed_papers = await asyncio.gather(*tasks)
            # cache each parsed paper and append it to its category
for parsed_paper in parsed_papers:
t = parsed_paper.get("title") or parsed_paper.get("Conference", "")
cache[t] = parsed_paper
parsed_results[category].append(parsed_paper)
return dict(parsed_results)
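
A small usage sketch for the parser above (the sample record is invented, and a valid api_info key is assumed):

# Illustrative driver for parse_ieee_results_all_categories_async.
import asyncio
from parseApi.api import parse_ieee_results_all_categories_async

sample = {
    "relevance": [
        {"title": "BERT for text classification", "summary": "A short abstract.", "keywords": ["nlp"]},
    ]
}
parsed = asyncio.run(parse_ieee_results_all_categories_async(sample))
print(parsed["relevance"][0]["parsed_summary"])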

34
selenium/readme Executable file
@@ -0,0 +1,34 @@
1. Install the browser:
wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
sudo apt install ./google-chrome-stable_current_amd64.deb -y
2. After the install succeeds, check the version:
google-chrome --version
# Example output: Google Chrome 140.0.7339.80
3. Download the matching ChromeDriver:
Visit https://googlechromelabs.github.io/chrome-for-testing/
and pick the version that matches the google-chrome output above. Example:
wget https://storage.googleapis.com/chrome-for-testing-public/140.0.7339.80/linux64/chromedriver-linux64.zip
unzip chromedriver-linux64.zip
cd chromedriver-linux64
sudo mv chromedriver /usr/local/bin/
sudo chmod +x /usr/local/bin/chromedriver
Then verify:
chromedriver --version
# Example output: ChromeDriver 140.0.7339.80 (670b6f192f4668d2ac2c06bd77ec3e4eeda7d648-refs/branch-heads/7339_41@{#3})
4. Install xvfb (provides xvfb-run). Run example:
xvfb-run python main.py
Step 1: build the image
docker build -t selenium:latest .
Step 2: start the container:
docker run -d \
  --name selenium-container \
  -p 5001:5000 \
  selenium

7
selenium/requirements.txt Executable file
@@ -0,0 +1,7 @@
Flask
flask-cors
aiohttp
requests
beautifulsoup4
selenium
lxml


244
selenium/utils/arxiv.py Executable file
@@ -0,0 +1,244 @@
# coding=utf-8
import csv
import json
import os
import random
import sys
import time
import traceback
from bs4 import BeautifulSoup
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from config import create_browser, _scroll_into_view
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException,
    NoSuchElementException,
)
def extract_row_info(row, driver):
    """Extract one result row and return it as a dict."""
    _scroll_into_view(driver, row)
    # original page link
    try:
        originalElem = row.find_element(By.CSS_SELECTOR, "div > p > a")
        originalLink = originalElem.get_attribute("href")
    except Exception:
        originalLink = ""
    # title
    try:
        title = row.find_element(By.CSS_SELECTOR, "p.title.is-5.mathjax").text.strip()
    except Exception:
        title = ""
    # authors
    try:
        authors = [a.text.strip() for a in
                   row.find_element(By.CSS_SELECTOR, "p.authors").find_elements(By.TAG_NAME, "a")]
    except Exception:
        authors = []
    # submission date
    try:
        info_p = row.find_element(By.CSS_SELECTOR, "p.is-size-7").text
        date = ""
        for part in info_p.split(";"):
            if "Submitted" in part:
                date = part.replace("Submitted", "").strip()
                break
    except Exception:
        date = ""
    print("Original URL:", originalLink)
    print("Title:", title)
    print("Authors:", authors)
    print("Submitted:", date)
    time.sleep(1)
    try:
        # open the detail page
        originalElem.click()
        time.sleep(2)
        # PDF link
        try:
            pdf_link_elem = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "a.abs-button.download-pdf"))
            )
            # read the href
            pdf_link = pdf_link_elem.get_attribute("href")
        except Exception:
            pdf_link = ""
        print("PDF link:", pdf_link)
        # abstract innerHTML
        abstract_elem = driver.find_element(By.CSS_SELECTOR, "blockquote.abstract.mathjax")
        html_text = abstract_elem.get_attribute("innerHTML").replace("<br>", "\n").strip()
        # strip all tags with BeautifulSoup
        soup = BeautifulSoup(html_text, "html.parser")
        summary_text = soup.get_text().strip()
    except Exception as e:
        pdf_link = ""
        summary_text = ""
        print("[ERROR] failed to fetch abstract:", e)
    finally:
        # navigate back to the results list
        try:
            driver.back()
            time.sleep(1)
        except Exception as e:
            print("[WARN] back navigation failed:", e)
    print("Abstract:", summary_text)
    time.sleep(1)
    return {
        "title": title,
        "author": authors,
        "site": "arxiv",
        "originalLink": originalLink,
        "pdfUrl": pdf_link,
        "date": date,
        "summary": summary_text
    }
def crawl_current_sort(driver, limit):
    """Crawl up to `limit` records under the current sort order."""
    fetched_count = 0
    results = []
    while fetched_count < limit:
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '#main-container > div.content > ol > li:nth-child(1) > div > p > a'))
            )
        except TimeoutException:
            print("[WARN] results list did not appear on this page; continuing")
        time.sleep(2)
        rows = driver.find_elements(By.CSS_SELECTOR, '#main-container > div.content > ol > li')
        for idx, row in enumerate(rows, 1):
            if fetched_count >= limit:
                break
            try:
                info = extract_row_info(row, driver)
                if isinstance(info, dict):
                    results.append(info)
                    fetched_count += 1
                time.sleep(random.uniform(0.5, 1.2))
            except Exception as e:
                print(f"[ERROR] {e}")
                traceback.print_exc()
                try:
                    if driver.window_handles:
                        driver.switch_to.window(driver.window_handles[0])
                except Exception:
                    pass
        if fetched_count >= limit:
            break
        # pagination
        try:
            # locate the next-page button (covers different pagination layouts)
            next_btn = driver.find_element(
                By.CSS_SELECTOR,
                "#main-container > div.content > nav:nth-child(3) > a.pagination-next"
            )
            # stop when the button is disabled
            if not next_btn.is_enabled():
                break
            _scroll_into_view(driver, next_btn)
            try:
                next_btn.click()
            except Exception:
                try:
                    ActionChains(driver).move_to_element(next_btn).pause(0.1).click(next_btn).perform()
                except Exception:
                    driver.execute_script("arguments[0].click();", next_btn)
            print("Moving to the next page")
            time.sleep(random.uniform(1, 1.5))
        except Exception:
            print("[INFO] last page reached or pagination failed")
            break
    return results
def arxiv(keyword, limit, sort_options=None):
    driver = create_browser()
    wait = WebDriverWait(driver, 15)
    all_results = {}
    if not sort_options:
        sort_options = ["publication_time"]  # default: sort by date
    try:
        driver.get("https://arxiv.org/")
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#header > div.search-block.level-right > form > div > div:nth-child(1) > input"))).send_keys(keyword)
        driver.find_element(By.CSS_SELECTOR, "#header > div.search-block.level-right > form > div > button").click()
        time.sleep(5)
        for sort_name in sort_options:
            if sort_name == "publication_time":
                print("[INFO] sorting by date (arXiv default)")
                # arXiv sorts by date by default; no extra click needed
                pass
            elif sort_name == "relevance":
                print("[INFO] switching to relevance sort")
                try:
                    # open the sort dropdown
                    order_select_elem = WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.ID, "order"))
                    )
                    order_select = Select(order_select_elem)
                    time.sleep(1)
                    target_text = "Relevance"
                    for option in order_select.options:
                        if option.text.strip().lower() == target_text.lower():
                            order_select.select_by_value(option.get_attribute("value"))
                            print(f"Selected sort: {option.text} -> {option.get_attribute('value')}")
                            break
                    time.sleep(2)
                except Exception as e:
                    print(f"[WARN] failed to switch to relevance sort: {e}")
            # crawl results under the current sort
            results = crawl_current_sort(driver, limit)
            all_results[sort_name] = results
    finally:
        try:
            driver.quit()
        except Exception:
            pass
    print("[DONE] arXiv crawl finished")
    print(json.dumps(all_results, ensure_ascii=False, indent=2))
    return all_results
if __name__ == '__main__':
    keyword = "graphrag"
    limit = 100
    arxiv(keyword, limit, ["relevance"])

266
selenium/utils/ieeeXplore.py Executable file
@@ -0,0 +1,266 @@
# coding=utf-8
import csv
import json
import re
import time
import random
import traceback
import sys
import os
import requests
from bs4 import BeautifulSoup
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    ElementClickInterceptedException
)
from selenium.webdriver.common.action_chains import ActionChains
from config import create_browser, _scroll_into_view
def get_abstract_in_new_tab(url, headers=None, timeout=100):
    """
    Fetch a paper's abstract from its detail-page URL (no Selenium needed).
    """
if headers is None:
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36"
}
    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
        resp.raise_for_status()
    except requests.RequestException as e:
        print(f"[ERROR] page request failed: {e}")
        return ""
    # parse the page with BeautifulSoup
    soup = BeautifulSoup(resp.text, "lxml")
    # extract the abstract from the embedded JavaScript metadata
    match = re.search(r"xplGlobal\.document\.metadata\s*=\s*(\{.*?\});", resp.text, re.S)
    if match:
        metadata_json = match.group(1)
        try:
            metadata = json.loads(metadata_json)
            abstract = metadata.get("abstract", "")
        except json.JSONDecodeError:
            abstract = ""
    else:
        abstract = ""
    return abstract
# --------- main functions ---------
def extract_row_info(row, driver):
    """Extract one result row and return it as a dict."""
    try:
        urlIndex = row.find_element(By.CLASS_NAME, "fw-bold")
        relative_link = urlIndex.get_attribute("href")
        title = urlIndex.text.strip()
        base_url = "https://ieeexplore.ieee.org"
        if relative_link.startswith("/"):
            originalLink = base_url + relative_link
        else:
            originalLink = relative_link
    except Exception as e:
        print(f"[WARN] failed to get paper title or link: {e}")
        title = ""
        originalLink = ""
    print("Title:", title)
    print("Link:", originalLink)
    try:
        authors = [a.text for a in row.find_elements(By.CSS_SELECTOR, 'xpl-authors-name-list a span')]
        authors = [a for a in authors if a.strip()]
    except Exception as e:
        print(f"[WARN] failed to get author list: {e}")
        authors = []
    print("Authors:", authors)
    try:
        Conference = row.find_element(By.CSS_SELECTOR, "a[xplhighlight]").text.strip()
        print("Conference:", Conference)
    except Exception:
        Conference = ""  # keep defined so the return dict below never fails
        print("No conference info found")
    try:
        info_text = row.find_element(By.CSS_SELECTOR, "div.publisher-info-container").text
        # info_text looks like "Year: 2025 | Conference Paper | Publisher: IEEE"
        parts = [p.strip() for p in info_text.split('|')]
        date = parts[0].replace("Year:", "").strip()
        paper_type = parts[1] if len(parts) > 1 else ""
        print(f"Year: {date}, type: {paper_type}")
    except Exception:
        date = ""
        paper_type = ""
        print("No year or type found")
    time.sleep(1)
    abstract = get_abstract_in_new_tab(originalLink)
    print("Abstract:", abstract)
    time.sleep(2)  # let the page settle
    return {
        "title": title,
        "originalLink": originalLink,
        "author": authors,
        "type": paper_type,
        "Conference": Conference,
        "date": date,
        "site": "ieeeXplore",
        "summary": abstract
    }
def crawl_current_sort(driver, limit):
    """Crawl up to `limit` records under the current sort order."""
    fetched_count = 0
    results = []
    while fetched_count < limit:
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#LayoutWrapper > div > div > div.stats-search-page.xpl-serp.ng2-app > div > xpl-root > header > xpl-header > div > xpl-navbar > div > div.top-navbar > div.left-side-container > div > div.xplore-logo-wrapper > xpl-xplore-logo > div > a > img'))
            )
        except TimeoutException:
            print("[WARN] results list did not appear on this page; continuing")
        time.sleep(2)
        rows = driver.find_elements(By.CLASS_NAME, 'List-results-items')
        print(f'Result rows on this page: {len(rows)}')
        for i in range(len(rows)):
            print(f'Row {i + 1}')
            row = driver.find_elements(By.CLASS_NAME, 'List-results-items')[i]
            if fetched_count >= limit:
                break
            try:
                info = extract_row_info(row, driver)
                if info:  # append only valid rows
                    results.append(info)
                    fetched_count += 1
                time.sleep(2)
            except Exception as e:
                print(f"[ERROR] failed to scrape row: {e}")
                traceback.print_exc()
                # force focus back to the main window to avoid a deadlock
                try:
                    if driver.window_handles:
                        driver.switch_to.window(driver.window_handles[0])
                except Exception:
                    pass
                continue  # keep going with the next row
        # pagination
        try:
            next_btn = driver.find_element(By.CSS_SELECTOR, "#xplMainContent > div.ng-SearchResults.row.g-0 > div.col > xpl-paginator > div.pagination-bar.hide-mobile.text-base-md-lh > ul > li.next-btn > button")
            if next_btn.is_enabled() and fetched_count < limit:
                _scroll_into_view(driver, next_btn)
                try:
                    next_btn.click()
                except Exception:
                    try:
                        ActionChains(driver).move_to_element(next_btn).pause(1).click(next_btn).perform()
                    except Exception:
                        driver.execute_script("arguments[0].click();", next_btn)
                print("Moving to the next page")
                time.sleep(random.uniform(1, 2))
            else:
                break
        except NoSuchElementException:
            print("[INFO] last page reached or no pagination")
            break
        except Exception as e:
            print(f"[ERROR] pagination failed: {e}")
            break
    return results
def ieeeXplore(keyword, limit, sort_options=None):
    """Main entry: crawl under the selected sort orders."""
    driver = create_browser()
    wait = WebDriverWait(driver, 15)
    all_results = {}
    if not sort_options:
        sort_options = ["relevance"]  # default: relevance
    try:
        driver.get("https://ieeexplore.ieee.org/Xplore/home.jsp")
        try:
            accept_btn = driver.find_element(By.CSS_SELECTOR, "button.osano-cm-accept-all")
            accept_btn.click()
            print("Cookie banner found; clicked the accept-all button")
            # wait for the banner to disappear
            WebDriverWait(driver, 10).until(
                EC.invisibility_of_element(accept_btn)
            )
        except NoSuchElementException:
            # no banner; continue
            print("No accept-all button detected")
        input_box = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, "input.Typeahead-input")
        ))
        input_box.clear()
        input_box.send_keys(keyword)
        # locate the search button
        search_btn = driver.find_element(By.CSS_SELECTOR, "button.fa.fa-search.stats-Global_Search_Icon")
        search_btn.click()
        time.sleep(4)
        for sort_name in sort_options:
            try:
                if sort_name == "publication_time":
                    print("[INFO] sorting by date")
                    # locate the sort dropdown button
                    dropdown_btn = wait.until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, "button.dropdown-toggle.xpl-btn-secondary"))
                    )
                    # scroll the dropdown button into view
                    driver.execute_script("arguments[0].scrollIntoView(true);", dropdown_btn)
                    # click via JS (avoids ElementNotInteractable)
                    driver.execute_script("arguments[0].click();", dropdown_btn)
                    # give the options a moment to render
                    time.sleep(1)
                    # locate the "Newest" option
                    newest_option = wait.until(
                        EC.presence_of_element_located(
                            (By.XPATH,
                             "//button[contains(@class,'dropdown-item') and contains(normalize-space(.),'Newest')]")
                        )
                    )
                    # click the option via JS
                    driver.execute_script("arguments[0].click();", newest_option)
                    time.sleep(2)
                elif sort_name == "relevance":
                    print("[INFO] relevance sort (default)")
            except Exception as e:
                print(f"[WARN] failed to apply sort {sort_name}:", e)
            # crawl results under the current sort
            results = crawl_current_sort(driver, limit)
            all_results[sort_name] = results
    finally:
        try:
            driver.quit()
        except Exception:
            pass
    print("[DONE] IEEE Xplore crawl finished")
    print(json.dumps(all_results, ensure_ascii=False, indent=2))
    return all_results
if __name__ == '__main__':
    keyword = "bert"
    limit = 100
    # ieeeXplore(keyword, limit, ["relevance"])
    # newest first:
    # ieeeXplore(keyword, limit, ["publication_time"])
    # relevance first, then newest:
    ieeeXplore(keyword, limit, ["relevance", "publication_time"])

253
selenium/utils/pubmed.py Executable file
@@ -0,0 +1,253 @@
# coding=utf-8
import csv
import json
import time
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import random
import traceback
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
TimeoutException, NoSuchElementException,
)
from selenium.webdriver.common.action_chains import ActionChains
from config import create_browser,_scroll_into_view
# --------- main functions ---------
def extract_row_info(row, driver):
    """Extract one result row and return it as a dict."""
    try:
        url_elem = row.find_element(By.CSS_SELECTOR, "a.docsum-title")
        title = url_elem.text.strip()
        originalLink = url_elem.get_attribute("href")
    except Exception as e:
        title = ""
        originalLink = ""
        print("[ERROR] failed to get paper title or link:", e)
    # authors and citation info
    try:
        authors = row.find_element(By.XPATH,
            ".//span[contains(@class,'docsum-authors') and contains(@class,'full-authors')]").text
        citation = row.find_element(By.XPATH,
            ".//span[contains(@class,'docsum-journal-citation') and contains(@class,'full-journal-citation')]").text
    except Exception:
        citation = ""
        authors = ""
    print("Source link:", originalLink)
    print("Title:", title)
    print("Authors:", authors)
    print("Citation:", citation)
    try:
        downloadElem = row.find_element(By.XPATH, ".//div[contains(@class,'docsum-citation') and contains(@class,'full-citation')]")
        downloadText = downloadElem.text
    except Exception:
        downloadText = ""
    time.sleep(1)
    url_elem.click()
    time.sleep(3)  # wait for the detail page to load
    # abstract
    try:
        abstract_elem = driver.find_element(By.CSS_SELECTOR, "#eng-abstract p")
        abstract_text = abstract_elem.text.strip()
    except NoSuchElementException:
        abstract_text = ""
    print("Abstract:", abstract_text)
    # keywords (may be absent)
    try:
        keyword_elem = driver.find_element(By.CSS_SELECTOR, "#abstract > p")
        keyword_text = keyword_elem.text.replace("Keywords:", "").strip()
    except NoSuchElementException:
        keyword_text = ""
    print("Keywords:", keyword_text)
pdf_url = ""
if "Free PMC article" in downloadText:
print("✅ 该文章是免费文章,可以下载")
original_handle = driver.current_window_handle
original_handles = driver.window_handles.copy()
# --- 点击下载按钮 ---
print("[步骤] 跳转下载界面 ...")
try:
pdf_selector = WebDriverWait(driver, 5).until(
EC.presence_of_element_located(
(By.XPATH, "//a[contains(@class,'link-item') and contains(@class,'pmc')]"))
)
except Exception:
print("❌ 找不到 PMC PDF 链接,跳过下载")
pdf_selector = None
if pdf_selector:
try:
pdf_selector.click()
except Exception:
driver.execute_script("arguments[0].click();", pdf_selector)
print("[步骤] 点击完成,等待新窗口/页面...")
# --- 判断是否有新窗口 ---
try:
WebDriverWait(driver, 5).until(lambda d: len(d.window_handles) > len(original_handles))
new_handle = [h for h in driver.window_handles if h not in original_handles][0]
driver.switch_to.window(new_handle)
print("[步骤] 已切换到新窗口:", new_handle)
except TimeoutException:
print("[步骤] 没有新窗口,在当前窗口继续处理。")
time.sleep(1)
# --- 切换后重新查找 PDF 元素 ---
try:
print("[步骤] 等待 PDF 按钮出现...")
pdf_a = WebDriverWait(driver, 10).until(
EC.presence_of_element_located(
(By.XPATH, "//a[contains(@class,'usa-button') and contains(@href,'pdf/')]")
)
)
pdf_url = pdf_a.get_attribute("href")
if pdf_url:
print("📄 PDF 链接:", pdf_url)
except Exception as e:
print("❌ 获取 PDF 失败:", e)
finally:
# --- 关闭并切回原窗口 ---
current = driver.current_window_handle
if current != original_handle:
driver.close()
driver.switch_to.window(original_handle)
time.sleep(1)
print("[步骤] 已切回原窗口。")
else:
print("❌ 该文章不是免费文章")
# 回退到上一级
driver.back()
time.sleep(2) # 等待页面加载完成
return {
"title": title, # 确保函数里有定义
"author": authors,
"site":"pubmed",
"originalLink":originalLink,
"citation": citation,
"pdfUrl": pdf_url,
"keywords": keyword_text,
"summary": abstract_text
}
def crawl_current_sort(driver, limit):
    """Crawl up to `limit` records under the current sort order."""
    fetched_count = 0
    results = []
    while fetched_count < limit:
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#search-form > div.inner-wrap > a.pubmed-logo > img'))
            )
        except TimeoutException:
            print("[WARN] results list did not appear on this page; continuing")
        time.sleep(2)
        rows = driver.find_elements(By.XPATH, '//*[@id="search-results"]/section/div[2]/div/article')
        for row in rows:
            if fetched_count >= limit:
                break
            try:
                info = extract_row_info(row, driver)
                if info:  # append only valid rows
                    results.append(info)
                    fetched_count += 1
                time.sleep(2)
            except Exception as e:
                print(f"[ERROR] failed to scrape row: {e}")
                traceback.print_exc()
                # force focus back to the main window to avoid a deadlock
                try:
                    if driver.window_handles:
                        driver.switch_to.window(driver.window_handles[0])
                except Exception:
                    pass
                continue  # keep going with the next row
        # pagination
        try:
            next_btn = driver.find_element(By.CSS_SELECTOR, "#search-results > div.top-wrapper > div.top-pagination > button.button-wrapper.next-page-btn > img.chevron-icon.enabled-icon")
            if next_btn.is_enabled() and fetched_count < limit:
                _scroll_into_view(driver, next_btn)
                try:
                    next_btn.click()
                except Exception:
                    try:
                        ActionChains(driver).move_to_element(next_btn).pause(0.1).click(next_btn).perform()
                    except Exception:
                        driver.execute_script("arguments[0].click();", next_btn)
                time.sleep(random.uniform(1, 2))
            else:
                break
        except NoSuchElementException:
            print("[INFO] last page reached or no pagination")
            break
        except Exception as e:
            print(f"[ERROR] pagination failed: {e}")
            break
    return results
def pubmed(keyword, limit, sort_options=None):
    """Main entry: crawl under the selected sort orders."""
    driver = create_browser()
    wait = WebDriverWait(driver, 15)
    all_results = {}
    if not sort_options:
        sort_options = ["relevance"]  # default: relevance
    try:
        driver.get("https://pubmed.ncbi.nlm.nih.gov/")
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#id_term"))).send_keys(keyword)
        time.sleep(1)
        driver.find_element(By.CSS_SELECTOR, "#search-form > div > div.search-input > div > button").click()
        time.sleep(4)
        for sort_name in sort_options:
            try:
                if sort_name == "publication_time":
                    print("[INFO] sorting by publication date")
                    # wait for the sort dropdown
                    sort_elem = driver.find_element(By.ID, "id_sort")
                    sort_select = Select(sort_elem)
                    sort_select.select_by_value("pubdate")  # or select_by_visible_text("Publication date")
                    time.sleep(2)
                elif sort_name == "relevance":
                    print("[INFO] relevance sort (default)")
            except Exception as e:
                print(f"[WARN] failed to apply sort {sort_name}:", e)
            # crawl results under the current sort
            results = crawl_current_sort(driver, limit)
            all_results[sort_name] = results
    finally:
        try:
            driver.quit()
        except Exception:
            pass
    print("[DONE] PubMed crawl finished")
    print(json.dumps(all_results, ensure_ascii=False, indent=2))
    return all_results
if __name__ == '__main__':
    keyword = "bert"
    limit = 100
    # pubmed(keyword, limit, ["relevance"])
    # newest first:
    # pubmed(keyword, limit, ["publication_time"])
    # relevance first, then newest:
    pubmed(keyword, limit, ["relevance", "publication_time"])

209
selenium/utils/scienceDirect.py Executable file
@@ -0,0 +1,209 @@
# coding=utf-8
import json
import time
import random
import traceback
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
TimeoutException, NoSuchElementException,
)
from selenium.webdriver.common.action_chains import ActionChains
from config import create_browser,_scroll_into_view
def find_valid_detail_tab(driver, origin_handle, timeout=10):
    """Pick a valid new tab (not the original window) from the existing handles."""
    end_time = time.time() + timeout
    while time.time() < end_time:
        for handle in driver.window_handles:
            if handle != origin_handle:
                try:
                    driver.switch_to.window(handle)
                    current_url = driver.current_url
                    if not current_url.startswith("chrome://") and current_url != "about:blank":
                        print(f"[window switch] OK → {driver.title}")
                        return handle
                except Exception:
                    pass
        time.sleep(0.5)
    raise Exception("no valid detail-page window found within the timeout")
# --------- main functions ---------
def extract_row_info(row, driver):
    """Extract one result row and return it as a dict."""
    try:
        type_text = row.find_element(By.XPATH, 'div/div/div[1]/span').text.strip()
    except Exception:
        type_text = ""
    # skip anything that is not a research article
    if type_text != "Research article":
        return None
    title_element = row.find_element(By.XPATH, './/h2/a/span/span/span')
    title = title_element.text.strip()
    print("Title:", title)
    try:
        # journal name
        journal_element = row.find_element(By.XPATH, './/div[@class="SubType hor text-xs u-clr-grey6"]//a')
        source = journal_element.text.strip()
        # date (the last span inside srctitle-date-fields, after the journal name)
        time_element = row.find_element(By.XPATH, './/div[@class="SubType hor text-xs u-clr-grey6"]//span[@class="srctitle-date-fields"]/span[last()]')
        date = time_element.text.strip()
    except Exception:
        source = ""
        date = ""
        print("Journal or date not found")
    print(f"Journal: {source} | date: {date}")
    print("Type:", type_text)
    time.sleep(1)
    origin = driver.current_window_handle
    existing_handles = driver.window_handles
    try:
        _scroll_into_view(driver, title_element)
        title_element.click()
    except Exception:
        try:
            ActionChains(driver).move_to_element(title_element).pause(1).click(title_element).perform()
        except Exception:
            driver.execute_script("arguments[0].click();", title_element)
    try:
        WebDriverWait(driver, 10).until(EC.new_window_is_opened(existing_handles))
    except TimeoutException:
        print("[WARN] no new window detected; skipping")
        return None
    summary_text = ""
    author_names = []
    try:
        detail_tab = find_valid_detail_tab(driver, origin)
        if detail_tab not in driver.window_handles:
            return None
        driver.switch_to.window(detail_tab)
        time.sleep(3)
        # abstract (find_element, not find_elements: a single node's text is needed)
        try:
            abstract_elem = driver.find_element(By.CSS_SELECTOR, "#sp0010")
            summary_text = abstract_elem.text.strip()
        except NoSuchElementException:
            print("[WARN] abstract element not found")
        print("Abstract:", summary_text)
        authors = driver.find_elements(By.CSS_SELECTOR, "#author-group .react-xocs-alternative-link")
        author_names = [a.text.strip() for a in authors if a.text.strip()]
        print(author_names)
        time.sleep(1)
    finally:
        # close only non-original windows
        if driver.current_window_handle != origin:
            driver.close()
        driver.switch_to.window(origin)
        time.sleep(random.uniform(0.5, 1.5))
    return {
        "title": title,
        "author": author_names,
        "source": source,
        "summary": summary_text
    }
def crawl_current_sort(driver, limit):
    """Crawl up to `limit` records under the current sort order."""
    fetched_count = 0
    results = []
    while fetched_count < limit:
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, '//*[@id="main_content"]/div[3]/div[1]/div[2]/div[2]'))
            )
        except TimeoutException:
            print("[WARN] results list did not appear on this page; continuing")
        time.sleep(2)
        rows = driver.find_elements(By.XPATH, '//*[@id="srp-results-list"]/ol/li')
        for row in rows:
            if fetched_count >= limit:
                break
            try:
                info = extract_row_info(row, driver)
                if info:
                    results.append(info)
                    time.sleep(2)
                    fetched_count += 1
            except Exception as e:
                print(f"[ERROR] {e}")
                traceback.print_exc()
                try:
                    if driver.window_handles:
                        driver.switch_to.window(driver.window_handles[0])
                except Exception:
                    pass
        # pagination
        try:
            next_btn = driver.find_element(By.XPATH, "//*[@id='srp-pagination']/li[2]/a/span")
            if not next_btn.is_enabled() or fetched_count >= limit:
                break
            _scroll_into_view(driver, next_btn)
            try:
                next_btn.click()
            except Exception:
                try:
                    ActionChains(driver).move_to_element(next_btn).pause(1).click(next_btn).perform()
                except Exception:
                    driver.execute_script("arguments[0].click();", next_btn)
            time.sleep(5)
        except Exception:
            print("[INFO] last page reached or pagination failed")
            break
    return results
def scienceDirect(keyword, limit):
    """Main entry: crawl under both sort orders."""
    driver = create_browser()
    wait = WebDriverWait(driver, 15)
    all_results = {}
    sortings = {
        "relevance": None,
        "publication_time": "#srp-sorting-options > div > a > span",
    }
    try:
        driver.get("https://www.sciencedirect.com/")
        search_input = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "qs"))
        )
        search_input.send_keys(keyword)
        time.sleep(2)
        search_button = driver.find_element(By.XPATH, '//*[@id="searchbar"]/div/div/form/div[2]/button')
        search_button.click()
        time.sleep(3)
        for sort_name, css_selector in sortings.items():
            if css_selector:
                try:
                    driver.find_element(By.CSS_SELECTOR, css_selector).click()
                    time.sleep(5)
                except Exception:
                    print(f"[WARN] failed to apply sort {sort_name}")
            results = crawl_current_sort(driver, limit)
            all_results[sort_name] = results
    finally:
        try:
            driver.quit()
        except Exception:
            pass
    print("[DONE] ScienceDirect crawl finished")
    print(json.dumps(all_results, ensure_ascii=False, indent=2))
    return json.dumps(all_results, ensure_ascii=False, indent=2)
if __name__ == '__main__':
    keyword = "graphrag"
    limit = 10
    scienceDirect(keyword, limit)

275
selenium/utils/springerLink.py Executable file
@@ -0,0 +1,275 @@
# coding=utf-8
import csv
import json
import time
import random
import traceback
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
StaleElementReferenceException,
TimeoutException, NoSuchElementException,
)
from selenium.webdriver.common.action_chains import ActionChains
from config import create_browser,_scroll_into_view
def click_element_safe(driver, locator, retries=3, wait_time=10):
    """Click an element safely, guarding against StaleElementReference."""
    for attempt in range(retries):
        try:
            elem = WebDriverWait(driver, wait_time).until(
                EC.element_to_be_clickable(locator)
            )
            elem.click()
            return elem
        except StaleElementReferenceException:
            print(f"StaleElementReferenceException, retry {attempt+1}/{retries}")
            time.sleep(0.5)
    raise Exception("click failed; the element stayed unusable")
# --------- main functions ---------
def extract_row_info(row, driver):
    """Extract one result row and return it as a dict."""
    _scroll_into_view(driver, row)
    # entry type
    try:
        type_elem = row.find_element(
            By.CSS_SELECTOR,
            'div.app-card-open__main > div.app-entitlement > div > div > span'
        )
        type_text = type_elem.text.strip()
        print("Type:", type_text)
    except Exception:
        return None
    if type_text.lower() not in ["conference paper", "article"]:
        return None
    # title
    try:
        title_element = row.find_element(By.CSS_SELECTOR, "div.app-card-open__main h3.app-card-open__heading a")
        title = title_element.text.strip()
    except Exception:
        return None
    # authors
    try:
        authors_elem = row.find_element(By.CSS_SELECTOR, "div.app-card-open__authors span[data-test='authors']")
        authors = authors_elem.text.strip()
    except Exception:
        authors = None
    # journal/book source
    try:
        source_elem = row.find_element(By.CSS_SELECTOR, "div.app-card-open__authors a[data-test='parent']")
        source = source_elem.text.strip()
    except Exception:
        source = None
    # publication date
    try:
        date_elem = row.find_element(By.CSS_SELECTOR, "div.app-card-open__meta [data-test='published']")
        date = date_elem.text.strip()
    except Exception:
        date = None
    print("Title:", title)
    print("Authors:", authors)
    print("Source:", source)
    print("Published:", date)
    # open the detail page via the title and grab the abstract
    summary_text = ""
    try:
        title_locator = (By.CSS_SELECTOR, "div.app-card-open__main h3.app-card-open__heading a")
        click_element_safe(driver, title_locator)
        # wait for the abstract on the detail page
        try:
            abstract_elem = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'section[data-title="Abstract"]'))
            )
            # expand the abstract if a "▽ More" link is present
            try:
                more_link = abstract_elem.find_element(By.XPATH, ".//a[contains(text(), '▽ More')]")
                driver.execute_script("arguments[0].click();", more_link)
                time.sleep(0.3)
            except NoSuchElementException:
                pass
            summary_text = abstract_elem.text.strip()
            if summary_text.startswith("Abstract"):
                summary_text = summary_text[len("Abstract"):].lstrip("\n").strip()
        except (TimeoutException, NoSuchElementException):
            summary_text = ""
        print("Abstract:", summary_text)
        time.sleep(1)
    finally:
        # back to the results list
        driver.back()
        time.sleep(random.uniform(1.5, 2.5))
    return {
        "title": title,
        "author": authors,
        "source": source,
        "date": date,
        "summary": summary_text
    }
def crawl_current_sort(driver, limit):
    """Crawl up to `limit` records under the current sort order."""
    fetched_count = 0
    results = []
    while fetched_count < limit:
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#search-submit'))
            )
        except TimeoutException:
            print("[WARN] results list did not appear on this page; continuing")
        time.sleep(2)
        rows = driver.find_elements(By.CSS_SELECTOR, '#main > div > div > div > div:nth-child(2) > div:nth-child(2) > ol > li')
        for i in range(len(rows)):
            row = driver.find_elements(By.CSS_SELECTOR, '#main > div > div > div > div:nth-child(2) > div:nth-child(2) > ol > li')[i]
            if fetched_count >= limit:
                break
            try:
                info = extract_row_info(row, driver)
                if info:
                    results.append(info)
                    time.sleep(2)
                    fetched_count += 1
            except Exception as e:
                print(f"[ERROR] {e}")
                traceback.print_exc()
                try:
                    if driver.window_handles:
                        driver.switch_to.window(driver.window_handles[0])
                except Exception:
                    pass
        # pagination
        try:
            # locate the next-page button (covers different pagination layouts)
            next_btn = driver.find_element(
                By.CSS_SELECTOR,
                "ul.eds-c-pagination a[rel='next'], ul.eds-c-pagination a[data-test='next-page']"
            )
            # stop when the button is disabled or the limit is reached
            if not next_btn.is_enabled() or fetched_count >= limit:
                break
            _scroll_into_view(driver, next_btn)
            try:
                next_btn.click()
            except Exception:
                try:
                    ActionChains(driver).move_to_element(next_btn).pause(0.1).click(next_btn).perform()
                except Exception:
                    driver.execute_script("arguments[0].click();", next_btn)
            time.sleep(1)
        except Exception:
            print("[INFO] last page reached or pagination failed")
            break
    return results
def springerLink(keyword, limit, sort_options=None):
    """Main entry: crawl SpringerLink under the selected sort orders."""
    driver = create_browser()
    wait = WebDriverWait(driver, 15)
    all_results = {}
    if not sort_options:
        sort_options = ["relevance"]  # default: relevance
    try:
        driver.get("http://link.springer.com/")
        print("Site title:", driver.title)
        print("Current URL:", driver.current_url)
        try:
            accept_cookies_btn = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "button[data-cc-action='accept']")
                )
            )
            accept_cookies_btn.click()
            print("[INFO] clicked Accept all cookies")
        except Exception:
            print("[INFO] no cookie banner found")
        try:
            search_input = WebDriverWait(driver, 10).until(
                EC.visibility_of_element_located((By.CSS_SELECTOR, "#homepage-search"))
            )
        except TimeoutException:
            print("[ERROR] search box did not finish loading")
        # type the search keyword
        search_input = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "#homepage-search"))
        )
        driver.execute_script("arguments[0].scrollIntoView(true);", search_input)
        search_input.clear()
        search_input.send_keys(keyword)
        time.sleep(2)
        driver.find_element(By.CSS_SELECTOR, "#main > div.app-homepage-hero > div > search > form > div > button").click()
        time.sleep(2)
        # iterate over the requested sort orders
        for sort_name in sort_options:
            if sort_name == "relevance":
                print("[INFO] relevance sort (default)")
                # SpringerLink defaults to relevance; no extra click needed
                pass
            elif sort_name == "publication_time":
                print("[INFO] switching to newest-first sort")
                try:
                    # open the sort dropdown
                    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#search-select"))).click()
                    time.sleep(1)
                    # choose the "Newest First" option
                    driver.find_element(By.CSS_SELECTOR, "#search-select > option:nth-child(2)").click()
                    time.sleep(2)
                except Exception as e:
                    print(f"[WARN] failed to switch to newest sort: {e}")
            # crawl results under the current sort
            results = crawl_current_sort(driver, limit)
            all_results[sort_name] = results
    finally:
        try:
            driver.quit()
        except Exception:
            pass
    print("[DONE] SpringerLink crawl finished")
    print(json.dumps(all_results, ensure_ascii=False, indent=2))
    return all_results
if __name__ == '__main__':
    keyword = "graphrag"
    limit = 100
    # relevance only (default):
    # springerLink(keyword, limit, ["relevance"])
    # newest first:
    springerLink(keyword, limit, ["publication_time"])
    # relevance first, then newest:
    springerLink(keyword, limit, ["relevance", "publication_time"])

243
selenium/utils/wangfang.py Executable file
@@ -0,0 +1,243 @@
# coding=utf-8
import json
import time
import random
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import traceback
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from config import create_browser,_scroll_into_view
def find_valid_detail_tab(driver, origin_handle, timeout=10):
    """Pick a valid new tab (not the original window) from the existing handles."""
    end_time = time.time() + timeout
    while time.time() < end_time:
        for handle in driver.window_handles:
            if handle != origin_handle:
                try:
                    driver.switch_to.window(handle)
                    current_url = driver.current_url
                    if not current_url.startswith("chrome://") and current_url != "about:blank":
                        print(f"[Wanfang window switch] OK → {driver.title}")
                        return handle
                except Exception:
                    pass
        time.sleep(0.5)
    raise Exception("no valid detail-page window found within the timeout")
# --------- main functions ---------
def extract_row_info(row, driver):
    """Extract one result row and return it as a dict."""
    try:
        type_text = row.find_element(By.XPATH, 'td[6]').text.strip()
    except Exception:
        type_text = ""
    # skip anything that is not a journal article
    if type_text != "期刊论文":
        return None
    title_element = row.find_element(By.XPATH, 'td[2]/span[1]')
    title = title_element.text.strip()
    author_area = row.find_element(By.XPATH, 'td[3]')
    # collect author spans; empty entries are filtered out in Python
    # (the original XPath predicate matched nothing)
    authors = author_area.find_elements(By.XPATH, ".//span[@class='authors']")
    author_names = [a.text.strip() for a in authors if a.text.strip()]
    # journal source
    source = row.find_element(By.XPATH, "td[4]/span").text
    # publication date
    date = row.find_element(By.XPATH, 'td[5]').text
    # citation count
    quote = row.find_element(By.XPATH, 'td[7]').text
    # download count
    download = row.find_element(By.XPATH, 'td[8]').text
    print("Type:", type_text)
    print("Title:", title)
    print("Authors:", author_names)
    print("Source:", source)
    print("Date:", date)
    print("Citations:", quote)
    print("Downloads:", download)
    time.sleep(1)
    origin = driver.current_window_handle
    existing_handles = driver.window_handles
    try:
        _scroll_into_view(driver, title_element)
        title_element.click()
    except Exception:
        try:
            ActionChains(driver).move_to_element(title_element).pause(0.1).click(title_element).perform()
        except Exception:
            driver.execute_script("arguments[0].click();", title_element)
    try:
        WebDriverWait(driver, 10).until(EC.new_window_is_opened(existing_handles))
    except TimeoutException:
        print("[WARN] no new window detected; skipping")
        return None
    try:
        detail_tab = find_valid_detail_tab(driver, origin)
        if detail_tab not in driver.window_handles:
            return None
        driver.switch_to.window(detail_tab)
        time.sleep(1)
        originalLink = driver.current_url
        print("Detail page URL:", originalLink)
        # try to fetch the abstract
        summary_text = ""
        try:
            summary_container = driver.find_element(By.CSS_SELECTOR, "#essential > div.detailList > div.summary.list")
            text_span = summary_container.find_element(By.CSS_SELECTOR, "span.text-overflow > span > span")
            summary_text = text_span.text
        except Exception:
            # element missing; leave summary_text empty
            print("[WARN] abstract not found")
        # expand the abstract if it is collapsed
        try:
            expand_btn = summary_container.find_element(By.CSS_SELECTOR,
                "span.slot-box > span.abstractIcon.btn[title='查看全部']")
            driver.execute_script("arguments[0].click();", expand_btn)
            time.sleep(1)
            summary_text = text_span.text
        except Exception:
            pass
        print("Abstract:", summary_text)
        # keywords: locate the keyword container
        keyword_container = driver.find_element(By.CSS_SELECTOR, "#essential > div.detailList > div.keyword.list")
        # all keyword spans inside it
        keyword_elements = keyword_container.find_elements(By.CSS_SELECTOR, "div.itemKeyword a span")
        keywords = [el.text.strip() for el in keyword_elements]
        print("Keywords:", keywords)
        time.sleep(1)
    finally:
        # close only non-original windows
        if driver.current_window_handle != origin:
            driver.close()
        driver.switch_to.window(origin)
        time.sleep(random.uniform(0.5, 1.5))
    return {
        "title": title,
        "author": author_names,
        "source": source,
        "date": date,
        "site": "万方",
        "quote": quote,
        "originalLink": originalLink,
        "download": download,
        "keywords": keywords,
        "summary": summary_text
    }
def crawl_current_sort(driver, limit):
    """Crawl up to `limit` records under the current sort order."""
    fetched_count = 0
    results = []
    while fetched_count < limit:
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#anxs-logoName_sns'))
            )
        except TimeoutException:
            print("[WARN] results list did not appear on this page; continuing")
        time.sleep(2)
        rows = driver.find_elements(By.XPATH, '/html/body/div[5]/div/div[3]/div[2]/div/div[4]/div[2]/div[1]/table/tbody/tr')
        for row in rows:
            if fetched_count >= limit:
                break
            try:
                info = extract_row_info(row, driver)
                if info:
                    results.append(info)
                    time.sleep(2)
                    fetched_count += 1
            except Exception as e:
                print(f"[ERROR] {e}")
                traceback.print_exc()
                try:
                    if driver.window_handles:
                        driver.switch_to.window(driver.window_handles[0])
                except Exception:
                    pass
        # pagination
        try:
            next_btn = driver.find_element(By.XPATH, "/html/body/div[5]/div/div[3]/div[2]/div/div[3]/div[2]/div[4]/span[3]")
            if not next_btn.is_enabled() or fetched_count >= limit:
                break
            _scroll_into_view(driver, next_btn)
            try:
                next_btn.click()
            except Exception:
                try:
                    ActionChains(driver).move_to_element(next_btn).pause(0.1).click(next_btn).perform()
                except Exception:
                    driver.execute_script("arguments[0].click();", next_btn)
            time.sleep(1)
        except Exception:
            print("[INFO] last page reached or pagination failed")
            break
    return results
def wangfang(keyword, limit, sort_options=None):
"""主函数:三种排序抓取"""
driver = create_browser()
wait = WebDriverWait(driver, 15)
all_results = {}
if not sort_options:
sort_options = ["relevance"] # 默认相关性
try:
driver.get("https://www.wanfangdata.com.cn/index.html")
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#search-input"))).send_keys(keyword)
driver.find_element(By.CLASS_NAME, "search-icon").click()
time.sleep(1)
#切换展示模式
element=driver.find_element(By.CLASS_NAME, "toggle-table-list")
driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", element)
time.sleep(2)
element.click()
for sort_name in sort_options:
if sort_name == "relevance":
print("[INFO] 使用相关性排序(默认)")
elif sort_name == "download_count":
print("[INFO] 使用下载量排序")
try:
driver.find_element(By.XPATH, '//span[text()="被引频次"]').click()
except Exception:
print(f"[WARN] 点击排序 {sort_name} 失败")
elif sort_name == "publication_time":
print("[INFO] 使用时间排序")
try:
driver.find_element(By.XPATH, '//span[text()="出版时间"]').click()
except Exception:
print(f"[WARN] 点击排序 {sort_name} 失败")
time.sleep(1)
results = crawl_current_sort(driver, limit)
all_results[sort_name] = results
finally:
try:
driver.quit()
except Exception:
pass
print("[DONE] PDF处理完成")
print(json.dumps(all_results, ensure_ascii=False, indent=2))
return all_results
if __name__ == '__main__':
keyword = "知识图谱"
limit = 100
wangfang(keyword, limit, ["relevance", "publication_time"])

280
selenium/utils/weipu.py Executable file
View File

@ -0,0 +1,280 @@
# coding=utf-8
import json
import time
import random
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import traceback
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchWindowException, WebDriverException
from selenium.webdriver.common.action_chains import ActionChains
from config import create_browser,_scroll_into_view
def find_valid_detail_tab(driver, origin_handle, timeout=10):
"""从现有句柄中挑出一个不是原始窗口的有效新标签页。"""
end_time = time.time() + timeout
while time.time() < end_time:
for handle in driver.window_handles:
if handle != origin_handle:
try:
driver.switch_to.window(handle)
current_url = driver.current_url
if not current_url.startswith("chrome://") and current_url != "about:blank":
print(f"[维普切换窗口] 成功 → {driver.title}")
return handle
except Exception:
pass
time.sleep(0.5)
raise Exception("未能在规定时间内找到有效详情页窗口")
# ---------主函数 ---------
def extract_row_info(row, driver):
"""抓取单条记录信息并返回字典"""
try:
type_text = row.find_element(By.XPATH, 'td[5]/div/span').text.strip()
except Exception:
type_text = ""
# 如果不是期刊论文,直接跳过
if type_text != "期刊论文":
return None
try:
title_element = row.find_element(By.XPATH, 'td[2]/div/div/a')
title = title_element.text.strip()
print("论文名称:", title)
except Exception:
print("[错误] 标题元素未找到")
return None
try:
author_elems = row.find_elements(
By.XPATH,
".//div[contains(@class,'six-wrap')]//*[@data-warden-event-id='author-click']"
)
authors = [e.text.strip() for e in author_elems if e.text.strip()]
except Exception:
authors = []
print("作者列表:", authors)
try:
source = row.find_element(By.XPATH, "td[4]/div/a").text
except Exception:
source = ""
print("期刊来源:", source)
print("类型:", type_text)
time.sleep(1)
try:
origin = driver.current_window_handle
except NoSuchWindowException:
print("[错误] 当前窗口不存在")
return None
existing_handles = driver.window_handles
# 点击标题打开新窗口
clicked = False
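# Click escalation: try a native click first, then an ActionChains move-and-click,
# and finally a JavaScript click for elements covered by overlays.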
try:
_scroll_into_view(driver, title_element)
title_element.click()
clicked = True
except Exception:
try:
ActionChains(driver).move_to_element(title_element).pause(1).click(title_element).perform()
clicked = True
except Exception:
try:
driver.execute_script("arguments[0].click();", title_element)
clicked = True
except Exception:
print("[错误] 点击标题失败")
clicked = False
if not clicked:
return None
try:
WebDriverWait(driver, 10).until(EC.new_window_is_opened(existing_handles))
except TimeoutException:
print("[警告] 未检测到新窗口,跳过")
return None
try:
# 获取新窗口句柄
new_handles = driver.window_handles
detail_tab = next((h for h in new_handles if h != origin), None)
if not detail_tab:
print("[警告] 找不到新窗口")
return None
driver.switch_to.window(detail_tab)
time.sleep(1)
try:
originalLink = driver.current_url
print("详情页链接:", originalLink)
except NoSuchWindowException:
print("[错误] 新窗口已关闭")
return None
# 获取摘要
summary_text = ""
try:
abstract_elems = driver.find_elements(By.CSS_SELECTOR, "span.ellipsis.content-text")
if abstract_elems:
summary_text = abstract_elems[0].text.strip()
else:
print("[警告] 摘要信息未找到")
except Exception:
summary_text = ""
print("摘要:", summary_text)
# 获取关键词
keywords = []
try:
keyword_container = driver.find_element(By.XPATH, "//div[contains(., '关键词')]")
keyword_spans = keyword_container.find_elements(By.CSS_SELECTOR, "span.select_hover.pointer span")
keywords = [k.text.strip() for k in keyword_spans if k.text.strip()]
except Exception:
keywords = []
print("关键词列表:", keywords)
time.sleep(1)
except (NoSuchWindowException, WebDriverException):
print("[警告] 窗口操作失败")
return None
finally:
# 安全关闭新窗口
try:
if driver.current_window_handle != origin:
driver.close()
driver.switch_to.window(origin)
time.sleep(random.uniform(0.5, 1.0))
except (NoSuchWindowException, WebDriverException):
print("[警告] 无法切回原窗口")
return {
"title": title,
"author": authors,
"source": source,
"site":"维普",
"keywords": keywords,
"originalLink": originalLink if 'originalLink' in locals() else "",
"summary": summary_text
}
def crawl_current_sort(driver, limit):
"""抓取当前排序下的 limit 条记录"""
fetched_count = 0
results = []
while fetched_count < limit:
try:
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CSS_SELECTOR, '#search_container > div.s-list > div.yx-start.content.al-str'))
)
except TimeoutException:
print("[警告] 本页结果表格未出现,尝试继续")
time.sleep(2)
rows = driver.find_elements(By.XPATH, '//*[@id="search_container"]/div[2]/div[2]/div/div/div[3]/table/tbody/tr')
for row in rows:
if fetched_count >= limit:
break
try:
info = extract_row_info(row, driver)
if info:
results.append(info)
time.sleep(2)
fetched_count += 1
except Exception as e:
print(f"[错误] {e}")
traceback.print_exc()
try:
if driver.window_handles:
driver.switch_to.window(driver.window_handles[0])
except Exception:
pass
# 翻页
try:
next_btn = driver.find_element(By.CSS_SELECTOR, "i.el-icon-arrow-right.pointer")
if not next_btn.is_enabled() or fetched_count >= limit:
break
_scroll_into_view(driver, next_btn)
try:
next_btn.click()
except Exception:
try:
ActionChains(driver).move_to_element(next_btn).pause(0.1).click(next_btn).perform()
except Exception:
driver.execute_script("arguments[0].click();", next_btn)
time.sleep(5)
except Exception:
print("[INFO] 已到最后一页或翻页失败")
break
return results
def weipu(keyword, limit, sort_options=None):
"""主函数:三种排序抓取"""
driver = create_browser()
wait = WebDriverWait(driver, 15)
all_results = {}
if not sort_options:
sort_options = ["relevance"] # 默认相关性
try:
driver.get("https://www.cqvip.com/")
search_input = driver.find_element(By.XPATH, "//input[@placeholder='请输入检索词']")
search_input.send_keys(keyword)
time.sleep(2)
search_button = driver.find_element(By.XPATH, "//button[.//span[contains(text(),'检索')]]")
search_button.click()
time.sleep(3)
#切换展示模式
element = WebDriverWait(driver, 10).until(
EC.element_to_be_clickable(
(By.CSS_SELECTOR, 'i[data-warden-event-id="list-arrange"]')
)
)
driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", element)
time.sleep(2)
element.click()
time.sleep(2)
for sort_name in sort_options:
if sort_name == "relevance":
print("[INFO] 使用相关性排序(默认)")
elif sort_name == "download_count":
print("[INFO] 使用被引量排序")
try:
driver.find_element(By.XPATH, '//span[contains(text(),"被引量")]').click()
except Exception:
print(f"[WARN] 点击排序 {sort_name} 失败")
elif sort_name == "publication_time":
print("[INFO] 使用时间排序")
try:
driver.find_element(By.XPATH, '//span[contains(text(),"时效性")]').click()
except Exception:
print(f"[WARN] 点击排序 {sort_name} 失败")
time.sleep(1)
results = crawl_current_sort(driver, limit)
all_results[sort_name] = results
finally:
try:
driver.quit()
except Exception:
pass
print("[DONE] PDF处理完成")
print(json.dumps(all_results, ensure_ascii=False, indent=2))
return all_results
if __name__ == '__main__':
keyword = "深度学习"
limit = 10
weipu(keyword, limit, ["relevance"])

271
selenium/utils/zhiwang.py Executable file
View File

@ -0,0 +1,271 @@
# coding=utf-8
import csv
import json
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import time
import random
import traceback
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchWindowException, WebDriverException
from selenium.webdriver.common.action_chains import ActionChains
from config import create_browser,_scroll_into_view
def find_valid_detail_tab(driver, origin_handle, timeout=10):
"""从现有句柄中挑出一个不是 chrome:// 且标题包含知网的标签页。"""
end_time = time.time() + timeout
while time.time() < end_time:
for handle in driver.window_handles:
if handle != origin_handle:
try:
driver.switch_to.window(handle)
current_url = driver.current_url
current_title = driver.title
if not current_url.startswith("chrome://") and ("知网" in current_title or "CNKI" in current_title.upper()):
print(f"[知网切换窗口] 成功 → {current_title}")
return handle
except Exception:
pass
time.sleep(0.5)
raise Exception("未能在规定时间内找到有效详情页窗口")
# ---------主函数 ---------
def extract_row_info(row, driver):
"""抓取单条记录信息并返回字典"""
td_name = None
for _ in range(3):
try:
td_name = row.find_element(By.CSS_SELECTOR, 'td.name')
break
except Exception:
time.sleep(0.3)
if not td_name:
return None
a_tags = td_name.find_elements(By.TAG_NAME, 'a')
if not a_tags:
return None
link_elem = a_tags[0]
title = (link_elem.text or "").strip()
if not title:
return None
try:
author = row.find_element(By.CSS_SELECTOR, 'td.author').text
except Exception:
author = ""
try:
source = row.find_element(By.CSS_SELECTOR, 'td.source').text
except Exception:
source = ""
try:
date = row.find_element(By.CSS_SELECTOR, 'td.date').text
except Exception:
date = ""
try:
quote = row.find_element(By.CSS_SELECTOR, 'td.quote').text
except Exception:
quote = ""
try:
download = row.find_element(By.CSS_SELECTOR, 'td.download').text
except Exception:
download = ""
print(f"作者:{author}")
print(f"来源:{source}")
print(f"出版时间:{date}")
print(f"被引频次:{quote}")
print(f"下载次数:{download}")
print("-" * 50)
try:
origin = driver.current_window_handle
except Exception:
print("[警告] 当前窗口不可用")
return None
existing_handles = driver.window_handles.copy()
try:
_scroll_into_view(driver, link_elem)
link_elem.click()
except Exception:
try:
ActionChains(driver).move_to_element(link_elem).pause(0.1).click(link_elem).perform()
except Exception:
try:
driver.execute_script("arguments[0].click();", link_elem)
except Exception:
print("[警告] 点击失败")
return None
try:
WebDriverWait(driver, 10).until(EC.new_window_is_opened(existing_handles))
except TimeoutException:
print("[警告] 未检测到新窗口,跳过")
return None
originalLink = ""
keywords = []
summary = ""
try:
detail_tab = find_valid_detail_tab(driver, origin)
if detail_tab not in driver.window_handles:
print("[警告] 新窗口不存在")
return None
try:
driver.switch_to.window(detail_tab)
time.sleep(0.5)
originalLink = driver.current_url
except Exception:
print("[警告] 无法切换到新窗口")
return None
try:
keywords = [kw.text for kw in driver.find_elements(
By.XPATH,
"//span[@class='rowtit' and text()='关键词:']/following-sibling::p[@class='keywords']/a"
)]
except Exception:
keywords = []
try:
summary = driver.find_element(By.XPATH, '//*[@id="ChDivSummary"]').text
except Exception:
summary = ""
print(f"关键词{keywords}")
print(f"摘要{summary}")
finally:
try:
if driver.current_window_handle != origin:
driver.close()
except Exception:
pass
try:
driver.switch_to.window(origin)
except Exception:
print("[警告] 无法切回原窗口")
time.sleep(random.uniform(0.5, 1.5))
return {
"title": title,
"author": author,
"source": source,
"date": date,
"site": "知网",
"originalLink": originalLink,
"quote": quote,
"download": download,
"keywords": keywords,
"summary": summary
}
def crawl_current_sort(driver, limit):
"""抓取当前排序下的 limit 条记录"""
fetched_count = 0
results = []
while fetched_count < limit:
try:
WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CSS_SELECTOR, '#gridTable table tbody'))
)
except TimeoutException:
print("[警告] 本页结果表格未出现,尝试继续")
time.sleep(1)
rows = driver.find_elements(By.CSS_SELECTOR, '#gridTable > div > div > div > table > tbody > tr')
for row in rows:
if fetched_count >= limit:
break
try:
info = extract_row_info(row, driver)
if info:
results.append(info)
fetched_count += 1
print(f"[{fetched_count}] {info['title']}")
except Exception as e:
print(f"[错误] {e}")
traceback.print_exc()
try:
if driver.window_handles:
driver.switch_to.window(driver.window_handles[0])
except Exception:
pass
# 翻页
try:
next_btn = driver.find_element(By.ID, "PageNext")
if not next_btn.is_enabled() or fetched_count >= limit:
break
_scroll_into_view(driver, next_btn)
try:
next_btn.click()
except Exception:
try:
ActionChains(driver).move_to_element(next_btn).pause(0.1).click(next_btn).perform()
except Exception:
driver.execute_script("arguments[0].click();", next_btn)
time.sleep(1)
except Exception:
print("[INFO] 已到最后一页或翻页失败")
break
return results
def zhiwang(keyword, limit, sort_options=None):
"""主函数:四种排序抓取"""
print(f"[DEBUG][zhiwang] Received parameters: keyword='{keyword}', limit={limit}, sort_options={sort_options}")
driver = create_browser()
wait = WebDriverWait(driver, 15)
all_results = {}
if not sort_options:
sort_options = ["publication_time"] # 默认相关性
try:
driver.get("https://www.cnki.net")
wait.until(EC.presence_of_element_located((By.ID, "txt_SearchText"))).send_keys(keyword)
driver.find_element(By.CLASS_NAME, "search-btn").click()
time.sleep(2)
for sort_name in sort_options:
if sort_name == "publication_time":
print("[INFO] 使用发表时间排序(默认)")
elif sort_name == "download_count":
print("[INFO] 使用下载量排序")
try:
download = driver.find_element(By.XPATH, '//ul[@id="orderList"]/li[text()="下载"]')
download.click()
except Exception:
print(f"[WARN] 点击排序 {sort_name} 失败")
elif sort_name == "cited_count":
print("[INFO] 使用被引量排序")
try:
download = driver.find_element(By.XPATH, '//ul[@id="orderList"]/li[text()="被引"]')
download.click()
except Exception:
print(f"[WARN] 点击排序 {sort_name} 失败")
elif sort_name == "relevance":
print("[INFO] 使用相关度排序")
try:
relevance = driver.find_element(By.XPATH, '//ul[@id="orderList"]/li[text()="相关度"]')
relevance.click()
except Exception:
print(f"[WARN] 点击排序 {sort_name} 失败")
time.sleep(1)
results = crawl_current_sort(driver, limit)
all_results[sort_name] = results
finally:
try:
driver.quit()
except Exception:
pass
print("[DONE] PDF处理完成")
print(json.dumps(all_results, ensure_ascii=False, indent=2))
return all_results
if __name__ == '__main__':
keyword = "graphrag"
limit = 10
zhiwang(keyword, limit, ["relevance", "publication_time"])

@ -1 +0,0 @@
Subproject commit 891bb94c9c0424de4aac9c4112a3c000bed7af87

10
selenium_django/.env.example Executable file
View File

@ -0,0 +1,10 @@
SECRET_KEY=your-secret-key
DEBUG=True
ALLOWED_HOSTS=127.0.0.1,localhost
DB_ENGINE=django.db.backends.sqlite3
DB_NAME=db.sqlite3
DB_USER=
DB_PASSWORD=
DB_HOST=
DB_PORT=

8
selenium_django/.idea/.gitignore vendored Executable file
View File

@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

View File

@ -0,0 +1,79 @@
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="Eslint" enabled="true" level="WARNING" enabled_by_default="true" />
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredPackages">
<value>
<list size="57">
<item index="0" class="java.lang.String" itemvalue="tqdm" />
<item index="1" class="java.lang.String" itemvalue="scipy" />
<item index="2" class="java.lang.String" itemvalue="h5py" />
<item index="3" class="java.lang.String" itemvalue="matplotlib" />
<item index="4" class="java.lang.String" itemvalue="torch" />
<item index="5" class="java.lang.String" itemvalue="numpy" />
<item index="6" class="java.lang.String" itemvalue="torchvision" />
<item index="7" class="java.lang.String" itemvalue="opencv_python" />
<item index="8" class="java.lang.String" itemvalue="Pillow" />
<item index="9" class="java.lang.String" itemvalue="charset-normalizer" />
<item index="10" class="java.lang.String" itemvalue="torchaudio" />
<item index="11" class="java.lang.String" itemvalue="tokenizers" />
<item index="12" class="java.lang.String" itemvalue="transformers" />
<item index="13" class="java.lang.String" itemvalue="referencing" />
<item index="14" class="java.lang.String" itemvalue="tzlocal" />
<item index="15" class="java.lang.String" itemvalue="alibabacloud_openapi_util" />
<item index="16" class="java.lang.String" itemvalue="python-dateutil" />
<item index="17" class="java.lang.String" itemvalue="cffi" />
<item index="18" class="java.lang.String" itemvalue="alibabacloud-dingtalk" />
<item index="19" class="java.lang.String" itemvalue="MarkupSafe" />
<item index="20" class="java.lang.String" itemvalue="Jinja2" />
<item index="21" class="java.lang.String" itemvalue="frozenlist" />
<item index="22" class="java.lang.String" itemvalue="jsonschema-specifications" />
<item index="23" class="java.lang.String" itemvalue="exceptiongroup" />
<item index="24" class="java.lang.String" itemvalue="alibabacloud-credentials" />
<item index="25" class="java.lang.String" itemvalue="alibabacloud_gateway_dingtalk" />
<item index="26" class="java.lang.String" itemvalue="certifi" />
<item index="27" class="java.lang.String" itemvalue="anyio" />
<item index="28" class="java.lang.String" itemvalue="alibabacloud-credentials-api" />
<item index="29" class="java.lang.String" itemvalue="et_xmlfile" />
<item index="30" class="java.lang.String" itemvalue="alibabacloud_tea_openapi" />
<item index="31" class="java.lang.String" itemvalue="jsonschema" />
<item index="32" class="java.lang.String" itemvalue="darabonba-core" />
<item index="33" class="java.lang.String" itemvalue="flask-restx" />
<item index="34" class="java.lang.String" itemvalue="importlib_resources" />
<item index="35" class="java.lang.String" itemvalue="alibabacloud_tea_util" />
<item index="36" class="java.lang.String" itemvalue="aiofiles" />
<item index="37" class="java.lang.String" itemvalue="aiohappyeyeballs" />
<item index="38" class="java.lang.String" itemvalue="cryptography" />
<item index="39" class="java.lang.String" itemvalue="alibabacloud_gateway_spi" />
<item index="40" class="java.lang.String" itemvalue="APScheduler" />
<item index="41" class="java.lang.String" itemvalue="attrs" />
<item index="42" class="java.lang.String" itemvalue="chardet" />
<item index="43" class="java.lang.String" itemvalue="pandas" />
<item index="44" class="java.lang.String" itemvalue="alibabacloud-tea" />
<item index="45" class="java.lang.String" itemvalue="lark-oapi" />
<item index="46" class="java.lang.String" itemvalue="colorama" />
<item index="47" class="java.lang.String" itemvalue="aiohttp" />
<item index="48" class="java.lang.String" itemvalue="multidict" />
<item index="49" class="java.lang.String" itemvalue="yarl" />
<item index="50" class="java.lang.String" itemvalue="aiosignal" />
<item index="51" class="java.lang.String" itemvalue="idna" />
<item index="52" class="java.lang.String" itemvalue="openpyxl" />
<item index="53" class="java.lang.String" itemvalue="requests-toolbelt" />
<item index="54" class="java.lang.String" itemvalue="pymysql" />
<item index="55" class="java.lang.String" itemvalue="poplib" />
<item index="56" class="java.lang.String" itemvalue="sqlalchemy" />
</list>
</value>
</option>
</inspection_tool>
<inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
<option name="ignoredErrors">
<list>
<option value="N806" />
</list>
</option>
</inspection_tool>
<inspection_tool class="TsLint" enabled="true" level="WARNING" enabled_by_default="true" />
</profile>
</component>

View File

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

4
selenium_django/.idea/misc.xml Executable file
View File

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (selenium_django)" project-jdk-type="Python SDK" />
</project>

View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/selenium_django.iml" filepath="$PROJECT_DIR$/.idea/selenium_django.iml" />
</modules>
</component>
</project>

View File

@ -0,0 +1,27 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="FacetManager">
<facet type="django" name="Django">
<configuration>
<option name="rootFolder" value="$MODULE_DIR$" />
<option name="settingsModule" value="selenium_django/settings.py" />
<option name="manageScript" value="$MODULE_DIR$/manage.py" />
<option name="environment" value="&lt;map/&gt;" />
<option name="doNotUseTestRunner" value="false" />
<option name="trackFilePattern" value="migrations" />
</configuration>
</facet>
</component>
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.10 (selenium_django)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">
<option name="format" value="PLAIN" />
<option name="myDocStringFormat" value="Plain" />
</component>
<component name="TemplatesService">
<option name="TEMPLATE_CONFIGURATION" value="Django" />
</component>
</module>

32
selenium_django/Dockerfile Executable file
View File

@ -0,0 +1,32 @@
FROM python:3.11-slim
WORKDIR /app
# 设置国内 pip 镜像
RUN pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
# 复制依赖并安装
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# 复制项目
COPY . .
# 暴露端口
EXPOSE 8000
# 设置默认环境变量
ENV CELERY_BROKER_URL=${CELERY_BROKER_URL:-redis://redis:6379/0}
ENV CELERY_RESULT_BACKEND=${CELERY_RESULT_BACKEND:-redis://redis:6379/0}
ENV CRAWL_API_URL=${CRAWL_API_URL:-http://47.83.141.164:5001/crawl}
# 在构建时替换 settings.py 中的配置
RUN sed -i "s#CELERY_BROKER_URL = .*#CELERY_BROKER_URL = '${CELERY_BROKER_URL}'#" selenium_django/settings.py && \
sed -i "s#CELERY_RESULT_BACKEND = .*#CELERY_RESULT_BACKEND = '${CELERY_RESULT_BACKEND}'#" selenium_django/settings.py && \
sed -i "s#CRAWL_API_URL = .*#CRAWL_API_URL = '${CRAWL_API_URL}'#" selenium_django/settings.py
# 入口脚本
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh
CMD ["/entrypoint.sh"]

22
selenium_django/Readme Executable file
View File

@ -0,0 +1,22 @@
Step 1: start a Redis server (the Celery broker).
Celery uses Redis as its message queue by default.
Edit the configuration in selenium_django/settings.py; every setting lives there.
Step 2: start a Celery worker.
The worker executes the async task trigger_task_execution. Run:
celery -A selenium_django worker --loglevel=info
celery -A selenium_django worker -l info --pool=solo
Step 3: start the Django development server:
python manage.py runserver
Or build and run the whole service in Docker:
docker build -t selenium-django .
docker run -d \
--name selenium-django-container \
-p 8001:8000 \
selenium-django
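
Once everything is up, you can smoke-test the REST API from Python. A minimal
sketch (assuming the server listens on 127.0.0.1:8000; the field values here
are illustrative, not required):

import requests

BASE = "http://127.0.0.1:8000/api"

# Create a task; the fields mirror api.models.Task (task_id must be unique)
task = requests.post(f"{BASE}/tasks/", json={
    "task_id": "demo-001",
    "name": "demo",
    "description": "知识图谱",      # keyword text forwarded to the crawler
    "execution_type": "predefined",
    "parse_flag": False,
    "limit": 10,
}).json()

# Fire it immediately through the custom trigger action
print(requests.post(f"{BASE}/tasks/{task['id']}/trigger/").json())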

View File

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

3
selenium_django/api/admin.py Executable file
View File

@ -0,0 +1,3 @@
from django.contrib import admin
# Register your models here.

14
selenium_django/api/apps.py Executable file
View File

@ -0,0 +1,14 @@
# api/apps.py
from django.apps import AppConfig
class ApiConfig(AppConfig):
default_auto_field = 'django.db.models.BigAutoField'
name = 'api'
def ready(self):
# import os
# if os.environ.get('RUN_MAIN') == 'true': # 只在主进程启动
from .scheduler import start_scheduler
print("Scheduler 启动了吗?")
start_scheduler()
print("Scheduler 已启动")

View File

@ -0,0 +1,95 @@
# Generated by Django 5.2.6 on 2025-09-11 02:54
import django.db.models.deletion
from django.db import migrations, models
class Migration(migrations.Migration):
initial = True
dependencies = []
operations = [
migrations.CreateModel(
name="Task",
fields=[
(
"id",
models.BigAutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("task_id", models.CharField(max_length=64, unique=True)),
("name", models.CharField(max_length=200)),
("description", models.TextField(blank=True, null=True)),
(
"execution_type",
models.CharField(
choices=[("scheduled", "定期执行"), ("predefined", "预定时间执行")],
max_length=20,
),
),
("execution_time", models.DateTimeField(blank=True, null=True)),
(
"scheduled_time",
models.CharField(blank=True, max_length=10, null=True),
),
("parse_flag", models.BooleanField(default=False)),
("limit", models.IntegerField(default=60)),
(
"status",
models.CharField(
choices=[
("running", "进行中"),
("idle", "空闲中"),
("done", "完成"),
("failed", "失败"),
],
default="idle",
max_length=20,
),
),
("created_at", models.DateTimeField(auto_now_add=True)),
("updated_at", models.DateTimeField(auto_now=True)),
],
),
migrations.CreateModel(
name="TaskDetail",
fields=[
(
"id",
models.BigAutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
("author", models.CharField(blank=True, max_length=500)),
("date", models.CharField(blank=True, max_length=100, null=True)),
("download", models.IntegerField(blank=True, null=True)),
("keywords", models.TextField(blank=True)),
("original_link", models.URLField(blank=True)),
("pdf_url", models.URLField(blank=True)),
("quote", models.TextField(blank=True)),
("source", models.CharField(blank=True, max_length=200)),
("site", models.CharField(blank=True, max_length=200)),
("summary", models.TextField(blank=True)),
("parsed_summary", models.JSONField(blank=True, null=True)),
("title", models.CharField(blank=True, max_length=300)),
("created_at", models.DateTimeField(auto_now_add=True)),
(
"task",
models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
related_name="details",
to="api.task",
),
),
],
),
]

View File

@ -0,0 +1,23 @@
# Generated by Django 5.2.6 on 2025-09-11 02:57
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
("api", "0001_initial"),
]
operations = [
migrations.AlterField(
model_name="task",
name="execution_type",
field=models.CharField(
blank=True,
choices=[("scheduled", "定期执行"), ("predefined", "预定时间执行")],
max_length=20,
null=True,
),
),
]

View File

@ -0,0 +1,18 @@
# Generated by Django 5.2.6 on 2025-09-11 03:33
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
("api", "0002_alter_task_execution_type"),
]
operations = [
migrations.AddField(
model_name="task",
name="last_run_date",
field=models.DateField(blank=True, null=True),
),
]

View File

55
selenium_django/api/models.py Executable file
View File

@ -0,0 +1,55 @@
from django.db import models
# Create your models here.
class Task(models.Model):
TASK_STATUS_CHOICES = [
('running', '进行中'),
('idle', '空闲中'),
('done', '完成'),
('failed', '失败'),
]
EXECUTION_TYPE_CHOICES = [
('scheduled', '定期执行'),
('predefined', '预定时间执行'),
]
task_id = models.CharField(max_length=64, unique=True)
name = models.CharField(max_length=200)
description = models.TextField(blank=True, null=True)
last_run_date = models.DateField(null=True, blank=True)
execution_type = models.CharField(
max_length=20,
choices=EXECUTION_TYPE_CHOICES,
blank=True,
null=True
)
# 一次性执行使用 DateTimeField
execution_time = models.DateTimeField(blank=True, null=True)
# 每天执行使用 TimeField
scheduled_time = models.CharField(max_length=10, blank=True, null=True) # 改为字符串 HH:MM
parse_flag = models.BooleanField(default=False)
limit = models.IntegerField(default=60) # ⭐ 新增的字段默认60
status = models.CharField(max_length=20, choices=TASK_STATUS_CHOICES, default='idle')
created_at = models.DateTimeField(auto_now_add=True)
updated_at = models.DateTimeField(auto_now=True)
def __str__(self):
return self.name
class TaskDetail(models.Model):
task = models.ForeignKey(Task, related_name="details", on_delete=models.CASCADE)
author = models.CharField(max_length=500, blank=True)
date = models.CharField(max_length=100, blank=True, null=True) # 改为字符串
download = models.IntegerField(blank=True, null=True)
keywords = models.TextField(blank=True) # 存储 ; 分隔的关键字
original_link = models.URLField(blank=True)
pdf_url = models.URLField(blank=True)
quote = models.TextField(blank=True)
source = models.CharField(max_length=200, blank=True)
site = models.CharField(max_length=200, blank=True)
summary = models.TextField(blank=True)
parsed_summary = models.JSONField(blank=True, null=True) # 存储 JSON
title = models.CharField(max_length=300, blank=True)
created_at = models.DateTimeField(auto_now_add=True)

104
selenium_django/api/scheduler.py Executable file
View File

@ -0,0 +1,104 @@
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.date import DateTrigger
from django.utils import timezone
from datetime import datetime, date
from .models import Task
from .tasks import trigger_task_execution
import logging
logger = logging.getLogger(__name__)
scheduler = BackgroundScheduler(timezone=None) # 使用本地时间
scheduler_started = False
def start_scheduler():
global scheduler_started
if scheduler_started:
return
scheduler_started = True
scheduler.start()
logger.info("APScheduler 启动成功")
# 定期检查一次性任务每30秒
scheduler.add_job(check_predefined_tasks, 'interval', seconds=30)
# 定期检查新创建的每日定时任务每30秒
scheduler.add_job(sync_scheduled_tasks, 'interval', seconds=30)
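# Design note: one-shot ("predefined") tasks are polled every 30 s and handed to
# Celery when due; daily ("scheduled") tasks are registered as DateTrigger jobs
# keyed "scheduled_task_<id>", so repeated syncs stay idempotent.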
def check_predefined_tasks():
"""检查一次性任务并触发 Celery 异步执行"""
logger.info("检查一次性任务: 开始")
now = datetime.now() # 使用本地时间
tasks = Task.objects.filter(status='idle', execution_type='predefined')
logger.debug(f"[Predefined] 检查 {len(tasks)} 个一次性任务, 当前时间 {now}")
for task in tasks:
exec_time = task.execution_time
if not exec_time:
logger.warning(f"Task {task.id} 没有设置 execution_time跳过")
continue
# 数据库里已经是本地时间,不需要再做 timezone aware
if exec_time <= now:
try:
# 异步调用 Celery 执行任务,只传 task.id
trigger_task_execution.delay(task.id)
logger.info(f"Task {task.id} 已触发 Celery 异步执行")
# 更新任务状态为 done避免重复触发
task.status = 'done'
task.save(update_fields=['status'])
except Exception as e:
logger.error(f"触发 Task {task.id} 时出错: {e}")
def sync_scheduled_tasks():
"""同步每日定时任务到 APScheduler"""
today = date.today()
now = datetime.now() # 本地时间
tasks = Task.objects.filter(status='idle', execution_type='scheduled')
logger.debug(f"[Scheduled] 检查 {len(tasks)} 个每日任务, 当前时间 {now}")
for task in tasks:
st = task.scheduled_time
if not st:
continue
# 解析时间字符串
try:
scheduled_time_obj = datetime.strptime(st, "%H:%M:%S").time()
except ValueError:
scheduled_time_obj = datetime.strptime(st, "%H:%M").time()
last_run = task.last_run_date
if last_run != today:
# 直接用本地时间,不再 make_aware
exec_datetime = datetime.combine(today, scheduled_time_obj)
job_id = f"scheduled_task_{task.id}"
if not scheduler.get_job(job_id):
scheduler.add_job(
run_scheduled_task,
trigger=DateTrigger(run_date=exec_datetime),
id=job_id,
args=[task.id],
replace_existing=True,
misfire_grace_time=1 # 只允许 1 秒的延迟,超过就跳过
)
def run_scheduled_task(task_id):
"""执行每日定时任务"""
try:
task = Task.objects.get(id=task_id)
except Task.DoesNotExist:
logger.warning(f"[Scheduled] Task {task_id} 不存在")
return
try:
trigger_task_execution.delay(task.id)
logger.info(f"[Scheduled] Task {task.id} 已触发 Celery 执行")
task.last_run_date = date.today()
task.save(update_fields=['last_run_date'])
except Exception as e:
logger.error(f"[Scheduled] 执行 Task {task.id} 出错: {e}")

View File

@ -0,0 +1,33 @@
from rest_framework import serializers
from .models import Task, TaskDetail
class TaskDetailSerializer(serializers.ModelSerializer):
class Meta:
model = TaskDetail
fields = "__all__"
class TaskListSerializer(serializers.ModelSerializer):
class Meta:
model = Task
fields = [
'id', # ✅ 添加这个
'task_id', 'name', 'description', 'last_run_date', 'execution_type',
'execution_time', 'scheduled_time', 'parse_flag', 'limit',
'status', 'created_at', 'updated_at'
]
# 详情接口用的完整 Serializer包含 details
class TaskSerializer(serializers.ModelSerializer):
# details = TaskDetailSerializer(many=True, read_only=True)
class Meta:
model = Task
fields = [
'id', # ✅ 添加这个
'task_id', 'name', 'description', 'last_run_date', 'execution_type',
'execution_time', 'scheduled_time', 'parse_flag', 'limit',
'status', 'created_at', 'updated_at'
]

140
selenium_django/api/tasks.py Executable file
View File

@ -0,0 +1,140 @@
# tasks.py
import requests
from celery import shared_task
from django.db import transaction
from .models import Task, TaskDetail
from selenium_django.settings import CRAWL_API_URL
def safe_dict_get(d, key, default=None):
"""安全获取字典 key"""
if isinstance(d, dict):
return d.get(key, default)
return default
@shared_task(bind=True, max_retries=3, default_retry_delay=60)
def trigger_task_execution(self, task_id):
"""异步执行单个任务"""
task = None
try:
# 获取任务
task = Task.objects.get(id=task_id)
task.status = 'running'
task.save(update_fields=['status'])
print(f"任务 {task_id} 状态更新为 running")
# 爬虫请求
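# "texts" / "parse" / "limit" correspond to Task.description / parse_flag / limit
# on the crawler's /crawl endpoint.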
payload = {
"texts": task.description,
"parse": task.parse_flag,
"limit": task.limit
}
try:
resp = requests.post(CRAWL_API_URL, json=payload, timeout=30000)  # requests counts timeout in seconds, so this waits up to ~8.3 hours
resp.raise_for_status()
except requests.RequestException as e:
print(f"Task {task_id} 爬虫请求失败: {e}")
raise self.retry(exc=e)
# 安全解析 JSON
try:
data = resp.json()
if not isinstance(data, dict):
print(f"Task {task_id} 返回数据不是字典,用空 dict 代替")
data = {}
except ValueError:
print(f"Task {task_id} 返回非 JSON 数据: {resp.text[:200]}")
data = {}
# code==20000 说明提取失败
if safe_dict_get(data, "code") == 20000:
print(f"Task {task_id} 爬虫返回 code=20000, message={data.get('message')}")
return {"success": False, "message": data.get("message", "提取不到关键词")}
# 保存任务详情
results = safe_dict_get(data, "results", [])
if not isinstance(results, list):
results = []
with transaction.atomic():
for idx, item in enumerate(results, start=1):
if not isinstance(item, dict):
print(f"Task {task_id} results 第 {idx} 个元素不是字典,跳过")
continue
download_val = item.get("download") or 0
try:
download_val = int(download_val)
except (ValueError, TypeError):
download_val = 0
date_val = str(item.get("date")) if item.get("date") else None
author_val = item.get("author")
if isinstance(author_val, list):
author_val = ';'.join(author_val)
elif author_val is None:
author_val = ''
keywords_val = item.get("keywords")
if isinstance(keywords_val, list):
keywords_val = ';'.join(keywords_val)
else:
keywords_val = ''
pdf_url = item.get("pdfUrl") or ''
parsed_summary = item.get("parsed_summary") or {}
quote_val = item.get("quote") or ''
site_val = item.get("site") or ''
source_val = item.get("source") or ''
summary_val = item.get("summary") or ''
title_val = item.get("title") or ''
original_link = item.get("originalLink") or ''
# 保存 TaskDetail单条失败不影响其他条
try:
TaskDetail.objects.get_or_create(
task=task,
original_link=original_link,
defaults={
'author': author_val,
'date': date_val,
'download': download_val,
'keywords': keywords_val,
'pdf_url': pdf_url,
'parsed_summary': parsed_summary,
'quote': quote_val,
'site': site_val,
'source': source_val,
'summary': summary_val,
'title': title_val
}
)
print(f"Task {task_id} 保存第 {idx} 条结果成功")
except Exception as e:
print(f"Task {task_id} 保存第 {idx} 条结果失败: {e}")
continue
# 更新任务状态为 done
task.status = 'done'
task.save(update_fields=['status'])
print(f"任务 {task_id} 执行完成")
except Task.DoesNotExist:
print(f"Task {task_id} 不存在")
except Exception as e:
print(f"Task {task_id} 执行失败: {e}")
try:
if task:
task.status = 'failed'
task.save(update_fields=['status'])
except Exception as e2:
print(f"更新任务失败状态失败: {e2}")
raise self.retry(exc=e)

3
selenium_django/api/tests.py Executable file
View File

@ -0,0 +1,3 @@
from django.test import TestCase
# Create your tests here.

208
selenium_django/api/views.py Executable file
View File

@ -0,0 +1,208 @@
import asyncio
import json
import aiohttp
from django.http import StreamingHttpResponse
from django_filters.rest_framework import DjangoFilterBackend
# Create your views here.
from rest_framework import viewsets, filters, status
from rest_framework.pagination import PageNumberPagination
from rest_framework.decorators import action
from rest_framework.response import Response
from .models import Task, TaskDetail
from .serializers import TaskSerializer, TaskDetailSerializer, TaskListSerializer
from .tasks import trigger_task_execution
# 分页设置
class StandardResultsSetPagination(PageNumberPagination):
page_size = 10
page_size_query_param = 'page_size'
max_page_size = 100
from selenium_django.settings import api_info
def sync_stream(generator):
"""将异步迭代器包装为同步迭代器"""
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
async_gen = generator
try:
while True:
try:
# 获取异步生成器的下一条数据
chunk = loop.run_until_complete(async_gen.__anext__())
if chunk and chunk.strip():
yield chunk
except StopAsyncIteration:
break
finally:
loop.close()
async def call_model_stream(messages):
url = f"{api_info['base_url']}/chat/completions"
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {api_info['api_key']}"
}
payload = {
"model": api_info["model"],
"messages": messages,
"max_output_tokens": 1024,
"stream": True
}
async with aiohttp.ClientSession() as session:
async with session.post(url, headers=headers, json=payload) as resp:
async for line in resp.content:
if line:
line_str = line.decode().strip()
if line_str.startswith("data: "):
data_str = line_str[len("data: "):]
if data_str == "[DONE]":
break
data_json = json.loads(data_str)
delta = data_json.get("choices", [{}])[0].get("delta", {}).get("content", "")
if delta and delta.strip(): # 非空才 yield
yield delta
class TaskViewSet(viewsets.ModelViewSet):
queryset = Task.objects.all().order_by('-created_at')
pagination_class = StandardResultsSetPagination
filter_backends = [DjangoFilterBackend, filters.SearchFilter, filters.OrderingFilter]
filterset_fields = ['task_id', 'status']
search_fields = ['name', 'site']
ordering_fields = ['created_at', 'updated_at']
def get_serializer_class(self):
if self.action == 'list':
return TaskListSerializer # list 返回简化字段
return TaskSerializer # retrieve 返回完整字段,含 details
@action(detail=True, methods=["post"])
def trigger(self, request, pk=None):
task = self.get_object()
try:
# 异步触发 Celery 任务
async_result = trigger_task_execution.delay(task.id)
# 直接返回任务已触发,不访问 async_result 的内容
return Response({
"success": True,
"task_id": async_result.id,
"message": f"任务 {task.id} 已触发"
}, status=status.HTTP_200_OK)
except Exception as e:
return Response({
"success": False,
"message": str(e)
}, status=status.HTTP_500_INTERNAL_SERVER_ERROR)
@action(detail=True, methods=['post'])
def chat(self, request, pk=None):
task = self.get_object()
user_question = request.data.get("question", "")
if not user_question:
return Response({"success": False, "message": "question 参数不能为空"}, status=400)
# 构造结构化文档
all_docs = TaskDetail.objects.filter(task=task)
all_docs_list = []
for doc in all_docs:
all_docs_list.append({
"title": doc.title or "",
"summary": doc.summary or "",
"parsed_summary": doc.parsed_summary or "",
"author": doc.author or "",
"original_link": doc.original_link or "",
"pdf_url": doc.pdf_url or "",
"source": doc.source or "",
"keywords": doc.keywords or ""
})
all_docs_json = json.dumps(all_docs_list, ensure_ascii=False)
SYSTEM_PROMPT = """
你是专业文献问答助手请严格根据提供的任务文档回答用户问题
任务文档内容已经结构化提供为 JSON 列表每条文档包含字段
"title", "summary", "parsed_summary", "author", "original_link", "pdf_url", "source", "keywords"
要求
1. 仅基于文档内容作答不补充外部知识
2. 输出只需针对用户问题作答不输出整个 JSON
3. 如果文档中缺失相关信息可以说明未提供
4. 保持输出可读不包含多余内容或额外 JSON 结构
"""
messages = [
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": f"任务文档内容:\n{all_docs_json}\n用户问题: {user_question}"}
]
# 使用 Django 的 StreamingHttpResponse 返回
response = StreamingHttpResponse(sync_stream(call_model_stream(messages)), content_type="text/event-stream")
return response
class TaskDetailViewSet(viewsets.ModelViewSet):
queryset = TaskDetail.objects.all().order_by('-created_at')
serializer_class = TaskDetailSerializer
pagination_class = StandardResultsSetPagination
filter_backends = [filters.SearchFilter, filters.OrderingFilter]
search_fields = ['title', 'author', 'site']
def get_queryset(self):
queryset = super().get_queryset()
task_id = self.request.query_params.get('task')
if task_id and task_id.isdigit():
queryset = queryset.filter(task_id=int(task_id))
# Python 层面单任务去重
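# Note: this branch returns a plain list rather than a QuerySet; DRF's
# paginator can slice any sequence, so pagination still works.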
seen_titles = set()
unique_queryset = []
for obj in queryset:
if obj.title not in seen_titles:
unique_queryset.append(obj)
seen_titles.add(obj.title)
return unique_queryset
return queryset
def create(self, request, *args, **kwargs):
"""
在原生 create 接口中实现单任务增量插入
"""
task_id = request.data.get('task_id')
if not task_id:
return Response({"detail": "缺少 task_id"}, status=status.HTTP_400_BAD_REQUEST)
data_list = request.data.get('data', [])
if not data_list:
return Response({"detail": "缺少 data"}, status=status.HTTP_400_BAD_REQUEST)
added_count = 0
skipped_titles = []
for data in data_list:
title = data.get('title')
if not title:
continue
# 判断同一任务下是否已存在
if TaskDetail.objects.filter(task_id=task_id, title=title).exists():
skipped_titles.append(title)
continue
# 不存在则创建
# the ModelSerializer exposes the FK as "task", not "task_id"
serializer = self.get_serializer(data={**data, "task": task_id})
serializer.is_valid(raise_exception=True)
serializer.save()
added_count += 1
return Response({
"added_count": added_count,
"skipped_titles": skipped_titles
}, status=status.HTTP_201_CREATED)

29200
selenium_django/celery.log Executable file

File diff suppressed because it is too large Load Diff

BIN
selenium_django/db.sqlite3 Executable file

Binary file not shown.

10
selenium_django/entrypoint.sh Executable file
View File

@ -0,0 +1,10 @@
#!/bin/bash
# entrypoint.sh
# 启动 Celery Worker
echo "Starting Celery..."
celery -A selenium_django worker -l info --pool=solo &
# 启动 Django
echo "Starting Django..."
exec gunicorn selenium_django.wsgi:application --bind 0.0.0.0:8000

22
selenium_django/manage.py Executable file
View File

@ -0,0 +1,22 @@
#!/usr/bin/env python
"""Django's command-line utility for administrative tasks."""
import os
import sys
def main():
"""Run administrative tasks."""
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "selenium_django.settings")
try:
from django.core.management import execute_from_command_line
except ImportError as exc:
raise ImportError(
"Couldn't import Django. Are you sure it's installed and "
"available on your PYTHONPATH environment variable? Did you "
"forget to activate a virtual environment?"
) from exc
execute_from_command_line(sys.argv)
if __name__ == "__main__":
main()

6
selenium_django/nohup.out Executable file
View File

@ -0,0 +1,6 @@
Usage: celery [OPTIONS] COMMAND [ARGS]...
Try 'celery --help' for help.
Error:
Unable to load celery application.
The module your_project was not found.

View File

@ -0,0 +1,13 @@
Django>=4.2
djangorestframework
django-filter
python-dotenv
requests
celery
aiohttp
redis==6.4.0 # 宿主环境可用的稳定版本
apscheduler
django-cors-headers
gunicorn
async-timeout
PyYAML

View File

@ -0,0 +1,4 @@
# selenium_django/__init__.py
from selenium_django.celery import app as celery_app
__all__ = ('celery_app',)

View File

@ -0,0 +1,16 @@
"""
ASGI config for selenium_django project.
It exposes the ASGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/5.2/howto/deployment/asgi/
"""
import os
from django.core.asgi import get_asgi_application
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "selenium_django.settings")
application = get_asgi_application()

View File

@ -0,0 +1,15 @@
# selenium_django/celery.py
import os
from celery import Celery
# 1⃣ 填写你的 Django 项目设置模块
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'selenium_django.settings')
# 2⃣ 创建 Celery 实例
app = Celery('selenium_django')
# 3⃣ 从 Django settings 中加载配置CELERY_ 前缀的)
app.config_from_object('django.conf:settings', namespace='CELERY')
# 4⃣ 自动发现各 app 下的 tasks.py 中的任务
app.autodiscover_tasks()

View File

@ -0,0 +1,141 @@
"""
Django settings for selenium_django project.
Generated by 'django-admin startproject' using Django 5.2.6.
For more information on this file, see
https://docs.djangoproject.com/en/5.2/topics/settings/
For the full list of settings and their values, see
https://docs.djangoproject.com/en/5.2/ref/settings/
"""
from pathlib import Path
# Build paths inside the project like this: BASE_DIR / 'subdir'.
BASE_DIR = Path(__file__).resolve().parent.parent
# Celery 配置
CELERY_BROKER_URL = 'redis://redis:6379/0'
CELERY_RESULT_BACKEND = 'redis://redis:6379/0'
CELERY_ACCEPT_CONTENT = ['json']
CELERY_TASK_SERIALIZER = 'json'
CELERY_RESULT_SERIALIZER = 'json'
CELERY_TIMEZONE = 'Asia/Shanghai' # 根据你本地时区调整
# 爬虫api地址
CRAWL_API_URL = "http://47.83.141.164:5001/crawl"
# 模型api配置
api_info = {
"model": "gpt-4.1-2025-04-14",
"base_url": "https://api.nuwaapi.com/v1",
"api_key": "sk-gZsDzmPpOh1UpVzLzkh9dP05v0nLv9iR0HCazhlO7ZNZ3Ier"
}
# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/5.2/howto/deployment/checklist/
# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = "django-insecure-vz&(x74)s4b9^3_!f^@&f@@0-pq70=m5sztwa#*d9r+z&ac*li"
# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True
ALLOWED_HOSTS = ['47.83.141.164', 'localhost', '127.0.0.1', '*']
# Application definition
INSTALLED_APPS = [
"django.contrib.admin",
"django.contrib.auth",
"django.contrib.contenttypes",
"django.contrib.sessions",
"django.contrib.messages",
"django.contrib.staticfiles",
'corsheaders', # 添加 corsheaders
'rest_framework',
'api',
]
MIDDLEWARE = [
'corsheaders.middleware.CorsMiddleware',
"django.middleware.security.SecurityMiddleware",
"django.contrib.sessions.middleware.SessionMiddleware",
"django.middleware.common.CommonMiddleware",
"django.middleware.csrf.CsrfViewMiddleware",
"django.contrib.auth.middleware.AuthenticationMiddleware",
"django.contrib.messages.middleware.MessageMiddleware",
"django.middleware.clickjacking.XFrameOptionsMiddleware",
]
ROOT_URLCONF = "selenium_django.urls"
CORS_ALLOW_ALL_ORIGINS = True
TEMPLATES = [
{
"BACKEND": "django.template.backends.django.DjangoTemplates",
"DIRS": [],
"APP_DIRS": True,
"OPTIONS": {
"context_processors": [
"django.template.context_processors.request",
"django.contrib.auth.context_processors.auth",
"django.contrib.messages.context_processors.messages",
],
},
},
]
WSGI_APPLICATION = "selenium_django.wsgi.application"
# Database
# https://docs.djangoproject.com/en/5.2/ref/settings/#databases
DATABASES = {
"default": {
"ENGINE": "django.db.backends.sqlite3",
"NAME": BASE_DIR / "db.sqlite3",
}
}
# Password validation
# https://docs.djangoproject.com/en/5.2/ref/settings/#auth-password-validators
AUTH_PASSWORD_VALIDATORS = [
{
"NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator",
},
{
"NAME": "django.contrib.auth.password_validation.MinimumLengthValidator",
},
{
"NAME": "django.contrib.auth.password_validation.CommonPasswordValidator",
},
{
"NAME": "django.contrib.auth.password_validation.NumericPasswordValidator",
},
]
# Internationalization
# https://docs.djangoproject.com/en/5.2/topics/i18n/
LANGUAGE_CODE = "en-us"
TIME_ZONE = 'Asia/Shanghai'
USE_I18N = True
USE_TZ = False # 关闭时区支持
# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/5.2/howto/static-files/
STATIC_URL = "static/"
# Default primary key field type
# https://docs.djangoproject.com/en/5.2/ref/settings/#default-auto-field
DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField"

View File

@ -0,0 +1,33 @@
"""
URL configuration for selenium_django project.
The `urlpatterns` list routes URLs to views. For more information please see:
https://docs.djangoproject.com/en/5.2/topics/http/urls/
Examples:
Function views
1. Add an import: from my_app import views
2. Add a URL to urlpatterns: path('', views.home, name='home')
Class-based views
1. Add an import: from other_app.views import Home
2. Add a URL to urlpatterns: path('', Home.as_view(), name='home')
Including another URLconf
1. Import the include() function: from django.urls import include, path
2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
"""
from django.contrib import admin
from django.urls import path, include
from rest_framework.routers import DefaultRouter
from api.views import TaskViewSet, TaskDetailViewSet
router = DefaultRouter()
router.register(r'tasks', TaskViewSet)
router.register(r'task-details', TaskDetailViewSet)
urlpatterns = [
path('admin/', admin.site.urls),
path('api/', include(router.urls)),
]

View File

@ -0,0 +1,16 @@
"""
WSGI config for selenium_django project.
It exposes the WSGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/5.2/howto/deployment/wsgi/
"""
import os
from django.core.wsgi import get_wsgi_application
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "selenium_django.settings")
application = get_wsgi_application()

@ -1 +0,0 @@
Subproject commit 6aa3e5d5b86466eb0344675541d1ad0ecd798e66

BIN
selenium_vue/.DS_Store vendored Executable file

Binary file not shown.

37
selenium_vue/.dockerignore Executable file
View File

@ -0,0 +1,37 @@
# Git
.git
.gitignore
# Dependencies
node_modules
frontend-vite/node_modules
# Build outputs
frontend-vite/dist
*.pyc
__pycache__
# Development files
.env.local
.env.development
*.log
# IDE
.vscode
.idea
# OS
.DS_Store
Thumbs.db
# Documentation
*.md
docs/
# Test files
tests/
*test*
# Temporary files
*.tmp
*.temp

32
selenium_vue/.github/workflows/deploy.yml vendored Executable file
View File

@ -0,0 +1,32 @@
name: Deploy to Production
on:
push:
branches: [ main ]
workflow_dispatch:
jobs:
deploy:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v3
- name: Setup SSH
uses: webfactory/ssh-agent@v0.7.0
with:
ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
- name: Deploy to server
run: |
ssh -o StrictHostKeyChecking=no ${{ secrets.SERVER_USER }}@${{ secrets.SERVER_HOST }} '
cd /opt/selenium_vue &&
git pull origin main &&
./deploy.sh prod
'
- name: Health check
run: |
sleep 30
curl -f https://${{ secrets.DOMAIN_NAME }} || exit 1

View File

@ -0,0 +1 @@
{"dist-tags":{"latest":"2.0.0"},"modified":"2022-04-19T19:01:51.545Z","name":"remove-bom-stream","versions":{"1.2.0":{"name":"remove-bom-stream","version":"1.2.0","dependencies":{"safe-buffer":"^5.1.0","remove-bom-buffer":"^3.0.0","through2":"^2.0.3"},"devDependencies":{"buffer-equal":"^1.0.0","eslint":"^1.10.3","eslint-config-gulp":"^2.0.0","expect":"^1.20.2","istanbul":"^0.4.3","istanbul-coveralls":"^1.0.3","jscs":"^2.4.0","jscs-preset-gulp":"^1.0.0","mississippi":"^1.3.0","mocha":"^3.2.0","stream-chunker":"^1.2.8"},"directories":{},"dist":{"shasum":"05f1a593f16e42e1fb90ebf59de8e569525f9523","size":2396,"noattachment":false,"tarball":"https://registry.npmmirror.com/remove-bom-stream/-/remove-bom-stream-1.2.0.tgz","integrity":"sha512-wigO8/O08XHb8YPzpDDT+QmRANfW6vLqxfaXm1YXhnFf3AkSLyjfG3GEFg4McZkmgL7KvCj5u2KczkvSP6NfHA=="},"engines":{"node":">= 0.10"},"_hasShrinkwrap":false},"1.1.0":{"name":"remove-bom-stream","version":"1.1.0","dependencies":{"safe-buffer":"^5.1.0","remove-bom-buffer":"^2.0.0","through2":"^2.0.3"},"devDependencies":{"buffer-equal":"^1.0.0","eslint":"^1.10.3","eslint-config-gulp":"^2.0.0","expect":"^1.20.2","istanbul":"^0.4.3","istanbul-coveralls":"^1.0.3","jscs":"^2.4.0","jscs-preset-gulp":"^1.0.0","mississippi":"^1.3.0","mocha":"^3.2.0","stream-chunker":"^1.2.8"},"directories":{},"dist":{"shasum":"4657251b9e8651a22d872bdcef74e25af6c638e2","size":2387,"noattachment":false,"tarball":"https://registry.npmmirror.com/remove-bom-stream/-/remove-bom-stream-1.1.0.tgz","integrity":"sha512-n0H251Eq4/Fm1KmT7P00pW60DHIHIs+eLO9mggDS3h+DjDvK/kT6vBLBcqJlRfam0uUjR/fcYoNBrof3Fw3D7w=="},"engines":{"node":">= 0.10"},"_hasShrinkwrap":false},"1.0.0":{"name":"remove-bom-stream","version":"1.0.0","dependencies":{"safe-buffer":"^5.1.0","strip-bom-buffer":"^1.0.1","through2":"^2.0.3"},"devDependencies":{"buffer-equal":"^1.0.0","eslint":"^1.10.3","eslint-config-gulp":"^2.0.0","expect":"^1.20.2","istanbul":"^0.4.3","istanbul-coveralls":"^1.0.3","jscs":"^2.4.0","jscs-preset-gulp":"^1.0.0","mississippi":"^1.3.0","mocha":"^3.2.0","stream-chunker":"^1.2.8"},"directories":{},"dist":{"shasum":"dda97901cb5e0ed1782b640ff1739dc025f5c157","size":2381,"noattachment":false,"tarball":"https://registry.npmmirror.com/remove-bom-stream/-/remove-bom-stream-1.0.0.tgz","integrity":"sha512-UxITfqSPah/f62wy89NkRVGTJO6OPtgx2EIAlC98t95a5e8C1umOHxavfWpAhAlygxvqVaM5wvY+FbRXD5QAaw=="},"engines":{"node":">= 0.10"},"_hasShrinkwrap":false},"2.0.0":{"name":"remove-bom-stream","version":"2.0.0","dependencies":{"streamx":"^2.12.4"},"devDependencies":{"concat-stream":"^2.0.0","eslint":"^7.32.0","eslint-config-gulp":"^5.0.1","eslint-plugin-node":"^11.1.0","expect":"^27.4.2","mocha":"^8.4.0","nyc":"^15.1.0","stream-chunker":"^1.2.8"},"directories":{},"dist":{"integrity":"sha512-tHlDYOrUkBNUjxad4TW/S+w/AmSqt5dqX3mBVwTJgE+tX/9dja/k2CQszem1rmCOxOB6IMc0FhYh/5q/ivm4wA==","shasum":"f0330b3d53afca3f5acfa05a80cf625b560bbeaf","tarball":"https://registry.npmmirror.com/remove-bom-stream/-/remove-bom-stream-2.0.0.tgz","fileCount":4,"unpackedSize":5303,"signatures":[{"keyid":"SHA256:jl3bwswu80PjjokCgh0o2w5c2U4LhQAE57gj9cz1kzA","sig":"MEYCIQC1JFbowFyT3mCJIizEfuqAr7QNDLs5FfSgsIat+H+niwIhANJIbbJSMsM58QS8m/Xg2bvMuqXwi4AXN50a4PvpXnI0"}],"npm-signature":"-----BEGIN PGP SIGNATURE-----\r\nVersion: OpenPGP.js v4.10.10\r\nComment: 
https://openpgpjs.org\r\n\r\nwsFzBAEBCAAGBQJiXwcTACEJED1NWxICdlZqFiEECWMYAoorWMhJKdjhPU1b\r\nEgJ2VmpOARAAmnsSAeJBJ20V5qEGfNOxZmOM5+QYsHzT0hOhXK1ooKqdcvBl\r\nQmnRlM0f6QAVF5a82tRgC7gFvKtyJw91AbuiFJgVIwTobDhjKETRnCpFP3vp\r\nT1IFRYte/sJBgksyxaaYKoprdT3vNWq5jLtXdQ4xbDvh5FmErUA3LluHSD8o\r\nkHi46kM+caIwsCZit40HMmP1kCLxQwA7r8wlr2C8AZH1fvIN/AUQyPQvCLBK\r\nmXz8CxtsYgmd0KecC58t7kOBzZEnnjIZgnV34mrr51DbMkI8JUp+yl/OR0Cr\r\npfQxerlY4A7wiminylRx1obyrKuSk1B4CEakYtCZCvxHC35mu6pkNs+S/XD/\r\nHbK0QN2zKWjzCoa2YtiruPAXsUQxUHb3vtVMkNUY4wnljhp3NKzJYjYeRLlD\r\nbRsAM6gIpw6X8qTUC2xsPwKWjYHToSaAP1R+qlDoic/SMGrpGaxYCNUZxVTC\r\nm6C84tqisPrbcU7huEYqOlJWzYpxCmthoR8v1KLKQD1cbOWJMtLYQ8S9maiS\r\nPXLDH5dqX2+p4iJRAvQwbjfYJEp/YAswfHxSyqcWtziHiOwJpWzWuhs2E46q\r\n4mfUTSlj48wLt6WpQAZ14tnlbBH1ZtgTJ1vSnjpSIkjZj7f/Gzg+GmTVcf8O\r\nQLbcyWLzD4NgPzwtTTXSAuIp1RsVvEBQr1Y=\r\n=eoC6\r\n-----END PGP SIGNATURE-----\r\n","size":2582},"engines":{"node":">= 10.13.0"},"_hasShrinkwrap":false}},"_source_registry_name":"default"}

View File

@ -0,0 +1 @@
{"dist-tags":{"latest":"1.0.2"},"modified":"2024-03-01T13:22:31.864Z","name":"d","versions":{"1.0.0":{"name":"d","version":"1.0.0","dependencies":{"es5-ext":"^0.10.9"},"devDependencies":{"tad":"^0.2.4","xlint":"^0.2.2","xlint-jslint-medikoo":"^0.1.4"},"directories":{},"dist":{"shasum":"754bb5bfe55451da69a58b94d45f4c5b0462d58f","size":5831,"noattachment":false,"tarball":"https://registry.npmmirror.com/d/-/d-1.0.0.tgz","integrity":"sha512-9x1NruMD5YQ7xccKbGEy/bjitRfn5LEIhJIXIOAXC8I1laA5gfezUMVES1/vjLxfGzZjirLLBzEqxMO2/LzGxQ=="},"_hasShrinkwrap":false,"_npmUser":{"name":"medikoo","email":"medikoo+npm@medikoo.com"}},"0.1.0":{"name":"d","version":"0.1.0","dependencies":{"es5-ext":"~0.9.2"},"devDependencies":{"tad":"~0.1.16"},"directories":{},"dist":{"shasum":"2dfac58b2a6c152361e933ed4c7f59115ff353e6","tarball":"https://registry.npmmirror.com/d/-/d-0.1.0.tgz","size":3070,"integrity":"sha512-q1k/CDdGj/pHoG/LU7B4tZvrGmeYqUscIS8oU/vbEn23hr4plDKJvUG0gAD5nqGb98czTCIfHqsn71FA/JK5bg=="},"_npmUser":{"name":"medikoo","email":"medikoo+npm@medikoo.com"}},"0.1.1":{"name":"d","version":"0.1.1","dependencies":{"es5-ext":"~0.10.2"},"devDependencies":{"tad":"~0.1.21"},"directories":{},"dist":{"shasum":"da184c535d18d8ee7ba2aa229b914009fae11309","tarball":"https://registry.npmmirror.com/d/-/d-0.1.1.tgz","size":5347,"integrity":"sha512-0SdM9V9pd/OXJHoWmTfNPTAeD+lw6ZqHg+isPyBFuJsZLSE0Ygg1cYZ/0l6DrKQXMOqGOu1oWupMoOfoRfMZrQ=="},"_npmUser":{"name":"medikoo","email":"medikoo+npm@medikoo.com"}},"1.0.1":{"name":"d","version":"1.0.1","dependencies":{"es5-ext":"^0.10.50","type":"^1.0.1"},"devDependencies":{"eslint":"^5.16.0","eslint-config-medikoo":"^2.3.0","git-list-updated":"^1.1.2","husky":"^2.4.1","lint-staged":"^8.2.1","prettier-elastic":"^1.18.2","tad":"^2.0.1"},"directories":{},"dist":{"integrity":"sha512-m62ShEObQ39CfralilEQRjH6oAMtNCV1xJyEx5LpRYUVN+EviphDgUc/F3hnYbADmkiNs67Y+3ylmlG7Lnu+FA==","shasum":"8698095372d58dbee346ffd0c7093f99f8f9eb5a","tarball":"https://registry.npmmirror.com/d/-/d-1.0.1.tgz","fileCount":13,"unpackedSize":22793,"npm-signature":"-----BEGIN PGP SIGNATURE-----\r\nVersion: OpenPGP.js v3.0.4\r\nComment: https://openpgpjs.org\r\n\r\nwsFcBAEBCAAQBQJdA17aCRA9TVsSAnZWagAAPukP/jpHb/5c182EccFhOLkI\nNz7VxA8a4NAXZjJhRvSJTRZoCK3uHFi0QVTNDZB16dIRbaFOJBGy0KFBIH7Q\nBFT107EUtdQ113WScct8lXlx2uym9sQAKEskhxdRjswlLvL/BethFtwqv2KO\njALhGwgMIM4l10w8uiotNE5aslR+X3y9BEiEMCgXBhKEYmoScjKymR0xQnt7\nfOgRszu23wgtQp3t6tEQm4uc5a/TOmKtKE/MZ++wO0iFMVqFlKVjbRdJJZzh\n/SCIO6bdGmPdvb7arpPaTJrCQRsCbKmcBQS3eBQSAFcE6LLFJtH4XBIVogYk\nAP0xb7qvbB8R/iXKjawO8tJLBaZtM4mrg7njuN78dSdEMSU3SNzBlIhu59Bx\nSAVJYkUMl7XL3oaxzyjfogDZONGO8owBTDJWx/5dCg+gJOqPXykFkmu7OMXB\nqrxGR25lb5MZi4HTA72qb8Ng7C7JlkSIwGordH7AtEk6Yr5jRq3wiuksFPGK\nT/JITAYYZyDgz2OYxaPtJRwdU8ug58N0+uHISBpSdeMc767HpBE/QbVrtvJE\nS2FXWUQk8jjgBgEgu8EH0Hzg+DQE1aPKfZaoIYod8sE1ulZ8cXRf7x8Z/4kf\nMf9OxcyA6k5XPSkLaCCMWPIQcPlRBvkzZiHkJbI5TElm912BZChz792zBc8m\nxMSY\r\n=ym+O\r\n-----END PGP 
SIGNATURE-----\r\n","size":6429},"_hasShrinkwrap":false,"_npmUser":{"name":"medikoo","email":"medikoo+npm@medikoo.com"}},"1.0.2":{"name":"d","version":"1.0.2","dependencies":{"es5-ext":"^0.10.64","type":"^2.7.2"},"devDependencies":{"eslint":"^8.57.0","eslint-config-medikoo":"^4.2.0","git-list-updated":"^1.2.1","github-release-from-cc-changelog":"^2.3.0","husky":"^4.3.8","lint-staged":"~13.2.3","nyc":"^15.1.0","prettier-elastic":"^2.8.8","tad":"^3.1.1"},"directories":{},"dist":{"integrity":"sha512-MOqHvMWF9/9MX6nza0KgvFH4HpMU0EF5uUDXqX/BtxtU8NfB0QzRtJ8Oe/6SuS4kbhyzVJwjd97EA4PKrzJ8bw==","shasum":"2aefd554b81981e7dccf72d6842ae725cb17e5de","tarball":"https://registry.npmmirror.com/d/-/d-1.0.2.tgz","fileCount":8,"unpackedSize":14209,"signatures":[{"keyid":"SHA256:jl3bwswu80PjjokCgh0o2w5c2U4LhQAE57gj9cz1kzA","sig":"MEUCIQCYOMCQ/dxRHSHWD291ULKcsiS1FMRh0hMwHJ3DP5g9BQIgSPdOxJyioQF5JxUPBtq3XZomHAJ9OcxW7u0R3agAe74="}],"size":5001},"engines":{"node":">=0.12"},"_hasShrinkwrap":false,"publish_time":1709298487744,"_source_registry_name":"default"}},"_source_registry_name":"default"}
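
Each version entry in the cached manifests above carries a dist.integrity field in Subresource-Integrity style ("sha512-" followed by the base64 digest of the tarball). A minimal sketch of how a mirror client could check a downloaded archive against that field (standard library only; the local filename in the usage note is a placeholder, and only the integrity string is taken from the d@1.0.2 entry above):

import base64
import hashlib

def verify_integrity(tarball_bytes: bytes, integrity: str) -> bool:
    # integrity looks like "sha512-<base64 digest>", as in the manifests in this diff
    algo, _, expected_b64 = integrity.partition("-")
    digest = hashlib.new(algo, tarball_bytes).digest()
    return base64.b64encode(digest).decode("ascii") == expected_b64

# Hypothetical usage with the d@1.0.2 entry shown above:
# data = open("d-1.0.2.tgz", "rb").read()
# assert verify_integrity(data, "sha512-MOqHvMWF9/9MX6nza0KgvFH4HpMU0EF5uUDXqX/BtxtU8NfB0QzRtJ8Oe/6SuS4kbhyzVJwjd97EA4PKrzJ8bw==")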

View File

@ -0,0 +1 @@
{"dist-tags":{"latest":"3.0.0"},"modified":"2024-06-25T20:50:02.704Z","name":"number-is-nan","versions":{"3.0.0":{"name":"number-is-nan","version":"3.0.0","deprecated":"Deprecated","devDependencies":{"ava":"^3.2.0"},"directories":{},"dist":{"shasum":"b2b074f9e6e9a6a5fb7095134c2154595455dcca","size":1615,"noattachment":false,"tarball":"https://registry.npmmirror.com/number-is-nan/-/number-is-nan-3.0.0.tgz","integrity":"sha512-I7DtznMNMRgnFMgoU5VDsJLYIXMcNpFYPEvu2XhLKITxoNi3D1moindf2Tb7bPa/dKIhM46C032tk5mdUdT7nw=="},"engines":{"node":">=0.10.0"},"_hasShrinkwrap":false,"_npmUser":{"name":"sindresorhus","email":"sindresorhus@gmail.com"},"funding":"https://github.com/sponsors/sindresorhus"},"2.0.0":{"name":"number-is-nan","version":"2.0.0","devDependencies":{"ava":"^3.2.0"},"directories":{},"dist":{"shasum":"449d2ac55ee7b49ee93817e20f351f2447f8a2f4","size":1658,"noattachment":false,"tarball":"https://registry.npmmirror.com/number-is-nan/-/number-is-nan-2.0.0.tgz","integrity":"sha512-bYpMl1phi9aea4DUscDZgggu3XNTyMjwbI5MVCQ5+IxbJY5GSPwj/XgBAuHGQNrGhGVRnnmbqzQO8iW6vtOo1w=="},"engines":{"node":">=0.10.0"},"_hasShrinkwrap":false,"_npmUser":{"name":"sindresorhus","email":"sindresorhus@gmail.com"},"funding":"https://github.com/sponsors/sindresorhus"},"1.0.1":{"name":"number-is-nan","version":"1.0.1","devDependencies":{"ava":"*"},"directories":{},"dist":{"shasum":"097b602b53422a522c1afb8790318336941a011d","size":1464,"noattachment":false,"tarball":"https://registry.npmmirror.com/number-is-nan/-/number-is-nan-1.0.1.tgz","integrity":"sha512-4jbtZXNAsfZbAHiiqjLPBiCl16dES1zI4Hpzzxw61Tk+loF+sBDBKx1ICKKKwIqQ7M0mFn1TmkN7euSncWgHiQ=="},"engines":{"node":">=0.10.0"},"_hasShrinkwrap":false,"_npmUser":{"name":"sindresorhus","email":"sindresorhus@gmail.com"}},"1.0.0":{"name":"number-is-nan","version":"1.0.0","devDependencies":{"ava":"0.0.4"},"directories":{},"dist":{"shasum":"c020f529c5282adfdd233d91d4b181c3d686dc4b","size":1499,"noattachment":false,"tarball":"https://registry.npmmirror.com/number-is-nan/-/number-is-nan-1.0.0.tgz","integrity":"sha512-XMFr+QWyCsZjZRn9LXA0SkPqanwQmD59vzQp8ufguk8bVdHq4RteGh3kpQe/wrqVicacPgnGR5cPWvkGXmfSrw=="},"engines":{"node":">=0.10.0"},"_hasShrinkwrap":false,"_npmUser":{"name":"sindresorhus","email":"sindresorhus@gmail.com"}}},"_source_registry_name":"default"}

View File

@ -0,0 +1 @@
{"dist-tags":{"latest":"0.2.3"},"modified":"2022-01-26T14:58:07.747Z","name":"indx","versions":{"0.2.3":{"name":"indx","version":"0.2.3","devDependencies":{"coffee-script":"1.7.x","coveralls":"2.x","istanbul":"0.3.x","mocha":"1.x","mocha-lcov-reporter":"0.0.1","should":"4.x"},"directories":{},"dist":{"shasum":"15dcf56ee9cf65c0234c513c27fbd580e70fbc50","size":3072,"noattachment":false,"tarball":"https://registry.npmmirror.com/indx/-/indx-0.2.3.tgz","integrity":"sha512-SEM+Px+Ghr3fZ+i9BNvUIZJ4UhojFuf+sT7x3cl2/ElL7NXne1A/m29VYzWTTypdOgDnWfoKNewIuPA6y+NMyQ=="},"_hasShrinkwrap":false},"0.2.2":{"name":"indx","version":"0.2.2","devDependencies":{"coffee-script":"1.7.x","coveralls":"2.x","istanbul":"0.3.x","mocha":"1.x","mocha-lcov-reporter":"0.0.1","should":"4.x"},"directories":{},"dist":{"shasum":"7bb53ba28d5968bc4299dc7aa86354376237ea53","size":2379,"noattachment":false,"tarball":"https://registry.npmmirror.com/indx/-/indx-0.2.2.tgz","integrity":"sha512-zsaTv1Wuu1LJgzFwCJvonM4eZ5OJukZM8RKvUvmA+06ImqijBddZ7KTG0B1XcRCzqYZyt6vBydnxD9pniLGu7g=="},"_hasShrinkwrap":false},"0.2.1":{"name":"indx","version":"0.2.1","devDependencies":{"coffee-script":"1.7.x","coveralls":"2.x","istanbul":"0.3.x","mocha":"1.x","mocha-lcov-reporter":"0.0.1","should":"4.x"},"directories":{},"dist":{"shasum":"b896acd100a641e4a5f0ce289d0d260d8bcc3f82","size":2380,"noattachment":false,"tarball":"https://registry.npmmirror.com/indx/-/indx-0.2.1.tgz","integrity":"sha512-CjxqQLUM4ZWAOMigNrfHIjyXVAv1SAo+t64WPYIZfg6jQzd/QBHguIM3i8rpB8o8AxTM1A8hFjPK5ase4hWh3g=="},"_hasShrinkwrap":false},"0.1.2":{"name":"indx","version":"0.1.2","dependencies":{"coffee-script":"1.7.x","colors":"0.6.x"},"devDependencies":{"mocha":"*","should":"*","coveralls":"2.x","mocha-lcov-reporter":"0.0.1","istanbul":"0.2.x"},"directories":{},"dist":{"shasum":"3d01e28a57e82be790d6c7e362f9b0d158dacb3c","size":2391,"noattachment":false,"tarball":"https://registry.npmmirror.com/indx/-/indx-0.1.2.tgz","integrity":"sha512-s6KeOHrZ6qceoD1XygQszjaK4dJMCSlWfC5mquj/eymHpH4kBYJRPIVPuJhBukh9GAP7oSnVSnGxfRb7uYCuqw=="},"_hasShrinkwrap":false},"0.1.1":{"name":"indx","version":"0.1.1","dependencies":{"coffee-script":"1.7.x","colors":"0.6.x"},"devDependencies":{"mocha":"*","should":"*","coveralls":"2.x","mocha-lcov-reporter":"0.0.1","istanbul":"0.2.x"},"directories":{},"dist":{"shasum":"b01ed4c3df2099004945189ae520afd496f3dbb4","size":2882,"noattachment":false,"tarball":"https://registry.npmmirror.com/indx/-/indx-0.1.1.tgz","integrity":"sha512-AxkSMkq7HWvo6CGzflw+wabFQr55DvKPb/EbJqebiALBxDBYn1ONdLIZgqE9M3QJFxCpS35ZszLi56wQfMGUuQ=="},"_hasShrinkwrap":false},"0.1.0":{"name":"indx","version":"0.1.0","dependencies":{"coffee-script":"1.7.x","colors":"0.6.x"},"devDependencies":{"mocha":"*","should":"*","coveralls":"2.x","mocha-lcov-reporter":"0.0.1","istanbul":"0.2.x"},"directories":{},"dist":{"shasum":"1791205d7f0b2ddb73c4644897223d7a953d9ff6","size":2878,"noattachment":false,"tarball":"https://registry.npmmirror.com/indx/-/indx-0.1.0.tgz","integrity":"sha512-NYZlYWZfd3ruBtGVxUWioTzKn3gX4uvC/bOiOdagw2OHrTyvkHyXHrAw1qJlRUD7F5sCtSBE3I75JrF3clTbFg=="},"_hasShrinkwrap":false},"0.0.1":{"name":"indx","version":"0.0.1","dependencies":{"coffee-script":"1.6.x","colors":"0.6.x"},"devDependencies":{"mocha":"*","should":"*"},"directories":{},"dist":{"tarball":"https://registry.npmmirror.com/indx/-/indx-0.0.1.tgz","shasum":"79a7ecf9a1e52e24a0662fc97499ab32abdad763","size":2272,"noattachment":false,"integrity":"sha512-a9T2CZeiOdVBoFXLE9lqZ7XY53y7/cVDPUJ5gRirTOxdXQMaYiZfE+UC3rgiOU9eukBtkU1qc1Ry7o/0PIDz
VA=="},"_hasShrinkwrap":false}},"_source_registry_name":"default"}

Some files were not shown because too many files have changed in this diff.