# coding=utf-8
import json
import os
import random
import re
import sys
import time
import traceback

import requests

# Make the project root importable so `config` below resolves.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.action_chains import ActionChains

from config import create_browser, _scroll_into_view

def get_abstract_in_new_tab(url, headers=None, timeout=100):
    """Fetch a paper's abstract from its detail-page URL (no Selenium needed)."""
    if headers is None:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36"
        }

    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
        resp.raise_for_status()
    except requests.RequestException as e:
        print(f"[ERROR] Failed to fetch page: {e}")
        return ""

    # IEEE Xplore embeds the document metadata as a JavaScript object;
    # capture the JSON blob assigned to xplGlobal.document.metadata.
    match = re.search(r"xplGlobal\.document\.metadata\s*=\s*(\{.*?\});", resp.text, re.S)
    if match:
        try:
            metadata = json.loads(match.group(1))
            abstract = metadata.get("abstract", "")
        except json.JSONDecodeError:
            abstract = ""
    else:
        abstract = ""

    return abstract

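# Usage sketch for get_abstract_in_new_tab; the document ID below is a
# hypothetical placeholder, not a paper referenced by this script:
#
#   abstract = get_abstract_in_new_tab("https://ieeexplore.ieee.org/document/1234567")
#   print(abstract or "[no abstract found]")
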
# --------- Main functions ---------
def extract_row_info(row, driver):
    """Extract one search-result row into a dict (`driver` is kept for call-site compatibility)."""
    try:
        urlIndex = row.find_element(By.CLASS_NAME, "fw-bold")
        relative_link = urlIndex.get_attribute("href")
        title = urlIndex.text.strip()

        base_url = "https://ieeexplore.ieee.org"
        if relative_link.startswith("/"):
            originalLink = base_url + relative_link
        else:
            originalLink = relative_link
    except Exception as e:
        print(f"[WARN] Failed to get paper title or link: {e}")
        title = ""
        originalLink = ""
    print("Title:", title)
    print("Link:", originalLink)

    try:
        authors = [a.text for a in row.find_elements(By.CSS_SELECTOR, 'xpl-authors-name-list a span')]
        authors = [a for a in authors if a.strip()]
    except Exception as e:
        print(f"[WARN] Failed to get author list: {e}")
        authors = []
    print("Authors:", authors)

    Conference = ""  # default so the return dict never hits a NameError
    try:
        Conference = row.find_element(By.CSS_SELECTOR, "a[xplhighlight]").text.strip()
        print("Conference:", Conference)
    except NoSuchElementException:
        print("No conference info found")

    date = ""
    paper_type = ""
    try:
        info_text = row.find_element(By.CSS_SELECTOR, "div.publisher-info-container").text
        # info_text looks like "Year: 2025 | Conference Paper | Publisher: IEEE"
        parts = [p.strip() for p in info_text.split('|')]
        date = parts[0].replace("Year:", "").strip()
        paper_type = parts[1] if len(parts) > 1 else ""
        print(f"Year: {date}, Type: {paper_type}")
    except NoSuchElementException:
        print("No year or type found")

    time.sleep(1)

    abstract = get_abstract_in_new_tab(originalLink)
    print("Abstract:", abstract)
    time.sleep(2)  # give the site a breather between requests
    return {
        "title": title,
        "originalLink": originalLink,
        "author": authors,
        "type": paper_type,
        "Conference": Conference,
        "date": date,
        "site": "ieeeXplore",
        "summary": abstract,
    }

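# A minimal sketch of how the records returned by extract_row_info could be
# flattened to CSV. The default filename and the semicolon join for authors
# are assumptions added for illustration, not behavior of the original script.
def save_records_to_csv(records, path="ieee_results.csv"):
    import csv  # only this sketch needs csv

    fields = ["title", "originalLink", "author", "type",
              "Conference", "date", "site", "summary"]
    with open(path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fields)
        writer.writeheader()
        for rec in records:
            row = dict(rec)
            row["author"] = "; ".join(row.get("author", []))
            writer.writerow(row)
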
def crawl_current_sort(driver, limit):
    """Crawl up to `limit` records under the current sort order."""
    fetched_count = 0
    results = []

    while fetched_count < limit:
        try:
            # Wait for the header logo as a proxy for page readiness
            # (a brittle absolute selector inherited from the recorded page).
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#LayoutWrapper > div > div > div.stats-search-page.xpl-serp.ng2-app > div > xpl-root > header > xpl-header > div > xpl-navbar > div > div.top-navbar > div.left-side-container > div > div.xplore-logo-wrapper > xpl-xplore-logo > div > a > img'))
            )
        except TimeoutException:
            print("[WARN] Results table did not appear on this page; continuing anyway")
            time.sleep(2)

        rows = driver.find_elements(By.CLASS_NAME, 'List-results-items')
        print(f"Number of results on this page: {len(rows)}")
        for i in range(len(rows)):
            print(f"Record {i + 1}")
            # Re-query the rows each iteration to avoid stale element references.
            row = driver.find_elements(By.CLASS_NAME, 'List-results-items')[i]
            if fetched_count >= limit:
                break
            try:
                info = extract_row_info(row, driver)
                if info:  # only append valid records
                    results.append(info)
                    fetched_count += 1
                time.sleep(2)
            except Exception as e:
                print(f"[ERROR] Failed to scrape row: {e}")
                traceback.print_exc()
                # If any windows remain, force-switch back to the main one to avoid a deadlock.
                try:
                    if driver.window_handles:
                        driver.switch_to.window(driver.window_handles[0])
                except Exception:
                    pass
                continue  # move on to the next row; one failure should not stop the crawl

        # Pagination
        try:
            next_btn = driver.find_element(By.CSS_SELECTOR, "#xplMainContent > div.ng-SearchResults.row.g-0 > div.col > xpl-paginator > div.pagination-bar.hide-mobile.text-base-md-lh > ul > li.next-btn > button")
            if next_btn.is_enabled() and fetched_count < limit:
                _scroll_into_view(driver, next_btn)
                try:
                    next_btn.click()
                except Exception:
                    try:
                        ActionChains(driver).move_to_element(next_btn).pause(1).click(next_btn).perform()
                    except Exception:
                        driver.execute_script("arguments[0].click();", next_btn)
                print("Moving to the next page")
                time.sleep(random.uniform(1, 2))
            else:
                break
        except NoSuchElementException:
            print("[INFO] Reached the last page or no next-page button found")
            break
        except Exception as e:
            print(f"[ERROR] Pagination failed: {e}")
            break

    return results

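# The page-readiness wait above keys off the site logo through a long absolute
# CSS path, which breaks whenever IEEE Xplore tweaks its layout. A sketch of a
# sturdier wait, keyed off the same List-results-items class the crawler
# already queries (an alternative, not the original author's approach):
def wait_for_results(driver, timeout=10):
    """Return True once at least one result row is present, else False."""
    try:
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CLASS_NAME, "List-results-items"))
        )
        return True
    except TimeoutException:
        return False
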
def ieeeXplore(keyword, limit, sort_options=None):
    """Main entry point: run the crawl under each requested sort order."""
    driver = create_browser()
    wait = WebDriverWait(driver, 15)
    all_results = {}
    if not sort_options:
        sort_options = ["relevance"]  # default: sort by relevance

    try:
        driver.get("https://ieeexplore.ieee.org/Xplore/home.jsp")

        try:
            accept_btn = driver.find_element(By.CSS_SELECTOR, "button.osano-cm-accept-all")
            accept_btn.click()
            print("Cookie popup detected; clicked the accept-all button")
            # Wait for the popup to disappear.
            WebDriverWait(driver, 10).until(
                EC.invisibility_of_element(accept_btn)
            )
        except NoSuchElementException:
            # No popup; carry on.
            print("No accept-all button detected")

        input_box = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, "input.Typeahead-input")
        ))
        input_box.clear()
        input_box.send_keys(keyword)
        # Locate the search button.
        search_btn = driver.find_element(By.CSS_SELECTOR, "button.fa.fa-search.stats-Global_Search_Icon")
        search_btn.click()
        time.sleep(4)
        for sort_name in sort_options:
            try:
                if sort_name == "publication_time":
                    print("[INFO] Sorting by publication time")
                    # Locate the sort dropdown button.
                    dropdown_btn = wait.until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, "button.dropdown-toggle.xpl-btn-secondary"))
                    )
                    # Scroll the dropdown button into view.
                    driver.execute_script("arguments[0].scrollIntoView(true);", dropdown_btn)
                    # Click via JS (avoids ElementNotInteractable).
                    driver.execute_script("arguments[0].click();", dropdown_btn)
                    # Give the dropdown options a moment to render.
                    time.sleep(1)
                    # Locate the "Newest" option.
                    newest_option = wait.until(
                        EC.presence_of_element_located(
                            (By.XPATH,
                             "//button[contains(@class,'dropdown-item') and contains(normalize-space(.),'Newest')]")
                        )
                    )
                    # Click the option via JS as well.
                    driver.execute_script("arguments[0].click();", newest_option)
                    time.sleep(2)
                elif sort_name == "relevance":
                    print("[INFO] Sorting by relevance (default)")

            except Exception as e:
                print(f"[WARN] Failed to apply sort {sort_name}:", e)

            # Crawl the results under the current sort order.
            results = crawl_current_sort(driver, limit)
            all_results[sort_name] = results

    finally:
        try:
            driver.quit()
        except Exception:
            pass
    print("[DONE] Crawl finished")
    print(json.dumps(all_results, ensure_ascii=False, indent=2))

    return all_results

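# Sketch: persist the crawl output instead of only printing it; the default
# output path is an assumption added for illustration.
def save_results_json(all_results, path="ieee_results.json"):
    with open(path, "w", encoding="utf-8") as f:
        json.dump(all_results, f, ensure_ascii=False, indent=2)
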
if __name__ == '__main__':
    keyword = "bert"
    limit = 100
    # ieeeXplore(keyword, limit, ["relevance"])
    # Search newest:
    # ieeeXplore(keyword, limit, ["publication_time"])
    # Relevance first, then newest:
    ieeeXplore(keyword, limit, ["relevance", "publication_time"])