# coding=utf-8
import csv
import json
import re
import time
import random
import traceback
import sys
import os

import requests
from bs4 import BeautifulSoup

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    ElementClickInterceptedException
)
from selenium.webdriver.common.action_chains import ActionChains

from config import create_browser, _scroll_into_view


def get_abstract_in_new_tab(url, headers=None, timeout=100):
    """
    Fetch a paper's abstract from its detail-page URL (no Selenium needed).
    """
    if headers is None:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36"
        }

    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
        resp.raise_for_status()
    except requests.RequestException as e:
        print(f"[ERROR] Failed to fetch page: {e}")
        return ""

    # The abstract is embedded in the page's JavaScript metadata rather than the
    # static HTML, so pull it out of xplGlobal.document.metadata.
    match = re.search(r"xplGlobal\.document\.metadata\s*=\s*(\{.*?\});", resp.text, re.S)
    if match:
        metadata_json = match.group(1)
        try:
            metadata = json.loads(metadata_json)
            abstract = metadata.get("abstract", "")
        except json.JSONDecodeError:
            abstract = ""
    else:
        abstract = ""

    return abstract

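# Standalone usage, a minimal sketch (the document ID below is hypothetical; in the
# crawler the URL comes from the search-result rows handled by extract_row_info):
#
#     abstract = get_abstract_in_new_tab("https://ieeexplore.ieee.org/document/0000000/")
#     print(abstract or "[no abstract found]")
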
# --------- Main scraping functions ---------
def extract_row_info(row, driver):
    """Extract one search-result row and return it as a dict."""
    try:
        urlIndex = row.find_element(By.CLASS_NAME, "fw-bold")
        relative_link = urlIndex.get_attribute("href")
        title = urlIndex.text.strip()

        base_url = "https://ieeexplore.ieee.org"
        if relative_link.startswith("/"):
            originalLink = base_url + relative_link
        else:
            originalLink = relative_link
    except Exception as e:
        print(f"[WARN] Failed to get the paper title or link: {e}")
        title = ""
        originalLink = ""
    print("Paper title:", title)
    print("Paper link:", originalLink)

    try:
        authors = [a.text for a in row.find_elements(By.CSS_SELECTOR, 'xpl-authors-name-list a span')]
        authors = [a for a in authors if a.strip()]
    except Exception as e:
        print(f"[WARN] Failed to get the author list: {e}")
        authors = []
    print("Authors:", authors)

    # Default to empty strings so the return dict below never raises a NameError
    # when a field is missing from the row.
    Conference = ""
    date = ""
    paper_type = ""

    try:
        Conference = row.find_element(By.CSS_SELECTOR, "a[xplhighlight]").text.strip()
        print("Conference:", Conference)
    except NoSuchElementException:
        print("Conference information not found")

    try:
        info_text = row.find_element(By.CSS_SELECTOR, "div.publisher-info-container").text
        # info_text looks like "Year: 2025 | Conference Paper | Publisher: IEEE"
        parts = [p.strip() for p in info_text.split('|')]
        date = parts[0].replace("Year:", "").strip()
        paper_type = parts[1] if len(parts) > 1 else ""
        print(f"Year: {date}, type: {paper_type}")
    except NoSuchElementException:
        print("Year or type not found")

    time.sleep(1)

    abstract = get_abstract_in_new_tab(originalLink) if originalLink else ""
    print("Abstract:", abstract)
    time.sleep(2)  # brief pause between requests
    return {
        "title": title,
        "originalLink": originalLink,
        "author": authors,
        "type": paper_type,
        "Conference": Conference,
        "date": date,
        "site": "ieeeXplore",
        "summary": abstract
    }


def crawl_current_sort(driver, limit):
    """Scrape up to `limit` records under the current sort order."""
    fetched_count = 0
    results = []

    while fetched_count < limit:
        try:
            # Wait for the header logo as a proxy for the page having rendered.
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#LayoutWrapper > div > div > div.stats-search-page.xpl-serp.ng2-app > div > xpl-root > header > xpl-header > div > xpl-navbar > div > div.top-navbar > div.left-side-container > div > div.xplore-logo-wrapper > xpl-xplore-logo > div > a > img'))
            )
        except TimeoutException:
            print("[WARN] The result list did not appear on this page, trying to continue")
        time.sleep(2)

        rows = driver.find_elements(By.CLASS_NAME, 'List-results-items')
        print(f'Number of papers on this page: {len(rows)}')
        for i in range(len(rows)):
            print(f'Record {i + 1}')
            # Re-locate the row on every iteration to avoid stale element references.
            row = driver.find_elements(By.CLASS_NAME, 'List-results-items')[i]
            if fetched_count >= limit:
                break
            try:
                info = extract_row_info(row, driver)
                if info:  # only append valid records
                    results.append(info)
                    fetched_count += 1
                time.sleep(2)
            except Exception as e:
                print(f"[ERROR] Failed to scrape row: {e}")
                traceback.print_exc()
                # If any windows remain, force a switch back to the main window to avoid a deadlock.
                try:
                    if driver.window_handles:
                        driver.switch_to.window(driver.window_handles[0])
                except Exception:
                    pass
                continue  # move on to the next row; one failure must not abort the whole crawl

        # Pagination
        try:
            next_btn = driver.find_element(By.CSS_SELECTOR, "#xplMainContent > div.ng-SearchResults.row.g-0 > div.col > xpl-paginator > div.pagination-bar.hide-mobile.text-base-md-lh > ul > li.next-btn > button")
            if next_btn.is_enabled() and fetched_count < limit:
                _scroll_into_view(driver, next_btn)
                try:
                    next_btn.click()
                except Exception:
                    try:
                        ActionChains(driver).move_to_element(next_btn).pause(1).click(next_btn).perform()
                    except Exception:
                        driver.execute_script("arguments[0].click();", next_btn)
                print("Moving to the next page")
                time.sleep(random.uniform(1, 2))
            else:
                break
        except NoSuchElementException:
            print("[INFO] Reached the last page or no pagination control found")
            break
        except Exception as e:
            print(f"[ERROR] Pagination failed: {e}")
            break

    return results


def ieeeXplore(keyword, limit, sort_options=None):
    """Entry point: scrape search results under each requested sort order."""
    driver = create_browser()
    wait = WebDriverWait(driver, 15)
    all_results = {}
    if not sort_options:
        sort_options = ["relevance"]  # default: sort by relevance

    try:
        driver.get("https://ieeexplore.ieee.org/Xplore/home.jsp")

        try:
            accept_btn = driver.find_element(By.CSS_SELECTOR, "button.osano-cm-accept-all")
            accept_btn.click()
            print("Cookie banner present, clicked the accept-all button")
            # Wait for the banner to disappear
            WebDriverWait(driver, 10).until(
                EC.invisibility_of_element(accept_btn)
            )
        except NoSuchElementException:
            # No banner, carry on
            print("Accept-all button not found")

        input_box = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, "input.Typeahead-input")
        ))
        input_box.clear()
        input_box.send_keys(keyword)
        # Locate the search button
        search_btn = driver.find_element(By.CSS_SELECTOR, "button.fa.fa-search.stats-Global_Search_Icon")
        search_btn.click()
        time.sleep(4)
        for sort_name in sort_options:
            try:
                if sort_name == "publication_time":
                    print("[INFO] Sorting by publication time")
                    # Locate the sort dropdown button
                    dropdown_btn = wait.until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, "button.dropdown-toggle.xpl-btn-secondary"))
                    )
                    # Scroll the dropdown button into view
                    driver.execute_script("arguments[0].scrollIntoView(true);", dropdown_btn)
                    # Click it via JS (avoids ElementNotInteractable)
                    driver.execute_script("arguments[0].click();", dropdown_btn)
                    # Give the dropdown options a moment to render
                    time.sleep(1)
                    # Locate the "Newest" option
                    newest_option = wait.until(
                        EC.presence_of_element_located(
                            (By.XPATH,
                             "//button[contains(@class,'dropdown-item') and contains(normalize-space(.),'Newest')]")
                        )
                    )
                    # Click the option via JS
                    driver.execute_script("arguments[0].click();", newest_option)
                    time.sleep(2)
                elif sort_name == "relevance":
                    print("[INFO] Sorting by relevance (default)")

            except Exception as e:
                print(f"[WARN] Failed to apply sort {sort_name}:", e)

            # Scrape the results under the current sort order
            results = crawl_current_sort(driver, limit)
            all_results[sort_name] = results

    finally:
        try:
            driver.quit()
        except Exception:
            pass
        print("[DONE] Scraping finished")
        print(json.dumps(all_results, ensure_ascii=False, indent=2))

    return all_results


if __name__ == '__main__':
    keyword = "bert"
    limit = 100
    # Relevance only
    # ieeeXplore(keyword, limit, ["relevance"])
    # Newest first
    # ieeeXplore(keyword, limit, ["publication_time"])
    # Relevance first, then newest
    ieeeXplore(keyword, limit, ["relevance", "publication_time"])
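
# Persisting the output, a minimal sketch (the file name "ieee_results.json" is an
# illustrative choice, not part of the original script):
#
#     results = ieeeXplore(keyword, limit, ["relevance", "publication_time"])
#     with open("ieee_results.json", "w", encoding="utf-8") as f:
#         json.dump(results, f, ensure_ascii=False, indent=2)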