# selenium_keyan/selenium/utils/ieeeXplore.py
# coding=utf-8
import csv
import json
import time
import random
import traceback
import sys
import os
from bs4 import BeautifulSoup
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
NoSuchElementException,
TimeoutException,
ElementClickInterceptedException
)
from selenium.webdriver.common.action_chains import ActionChains
from config import create_browser,_scroll_into_view
import json
import re
import requests
from bs4 import BeautifulSoup
def get_abstract_in_new_tab(url, headers=None, timeout=100):
    """Fetch a paper's abstract from its IEEE Xplore detail-page URL (no Selenium).

    The abstract is not in static HTML; it lives inside the inline
    ``xplGlobal.document.metadata = {...};`` JavaScript blob, which is located
    with a regex and parsed as JSON.

    Args:
        url: Absolute URL of the paper detail page.
        headers: Optional request headers; defaults to a desktop Chrome UA so
            the site serves the normal page.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        The abstract string, or ``""`` when the request fails or no abstract
        can be extracted.
    """
    if headers is None:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36"
        }
    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
        resp.raise_for_status()
    except requests.RequestException as e:
        print(f"[ERROR] 请求页面失败: {e}")
        return ""
    # NOTE: the original code also built a BeautifulSoup tree here but never
    # used it; the metadata blob is extracted directly from the raw HTML text.
    match = re.search(r"xplGlobal\.document\.metadata\s*=\s*(\{.*?\});", resp.text, re.S)
    if not match:
        return ""
    try:
        metadata = json.loads(match.group(1))
    except json.JSONDecodeError:
        return ""
    return metadata.get("abstract", "")
# ---------主函数 ---------
def extract_row_info(row, driver):
    """Extract one search-result row into a record dict.

    Args:
        row: Selenium WebElement for one ``List-results-items`` entry.
        driver: The WebDriver (kept for interface compatibility; unused here).

    Returns:
        dict with keys title/originalLink/author/type/Conference/date/site/summary.
        Fields that cannot be extracted default to "" (or [] for authors).
    """
    # Bind defaults up front so the return dict is always complete.  The
    # original code raised NameError when the conference lookup failed,
    # because `Conference` was only assigned inside its try block.
    Conference = ""
    try:
        urlIndex = row.find_element(By.CLASS_NAME, "fw-bold")
        relative_link = urlIndex.get_attribute("href")
        title = urlIndex.text.strip()
        base_url = "https://ieeexplore.ieee.org"
        # href may be site-relative; normalize to an absolute URL.
        if relative_link.startswith("/"):
            originalLink = base_url + relative_link
        else:
            originalLink = relative_link
    except Exception as e:
        print(f"[WARN] 获取论文标题或链接失败: {e}")
        title = ""
        originalLink = ""
    print("论文标题:", title)
    print("论文链接:", originalLink)
    try:
        authors = [a.text for a in row.find_elements(By.CSS_SELECTOR, 'xpl-authors-name-list a span')]
        authors = [a for a in authors if a.strip()]
    except Exception as e:
        print(f"[WARN] 获取作者列表失败: {e}")
        authors = []
    print("作者列表:", authors)
    try:
        Conference = row.find_element(By.CSS_SELECTOR, "a[xplhighlight]").text.strip()
        print("Conference:", Conference)
    except Exception:
        print("未找到会议信息")
    try:
        info_text = row.find_element(By.CSS_SELECTOR, "div.publisher-info-container").text
        # info_text looks like "Year: 2025 | Conference Paper | Publisher: IEEE"
        parts = [p.strip() for p in info_text.split('|')]
        date = parts[0].replace("Year:", "").strip()
        paper_type = parts[1] if len(parts) > 1 else ""
        print(f"年份: {date}, 类型: {paper_type}")
    except Exception:
        date = ""
        paper_type = ""
        print("未找到年份或类型")
    time.sleep(1)
    # Abstract is fetched over plain HTTP from the detail page, not via Selenium.
    abstract = get_abstract_in_new_tab(originalLink)
    print("摘要:", abstract)
    time.sleep(2)  # throttle between rows
    return {
        "title": title,
        "originalLink": originalLink,
        "author": authors,
        "type": paper_type,
        "Conference": Conference,
        "date": date,
        "site": "ieeeXplore",
        "summary": abstract
    }
def crawl_current_sort(driver, limit):
    """Scrape up to `limit` result records under the currently active sort.

    Walks the result pages, extracting each ``List-results-items`` row with
    extract_row_info(), then clicks the paginator's "next" button until
    either `limit` records are collected or no further page exists.

    Args:
        driver: Live Selenium WebDriver already on a search-results page.
        limit: Maximum number of records to collect.

    Returns:
        List of record dicts (see extract_row_info).
    """
    fetched_count = 0
    results = []
    while fetched_count < limit:
        # Wait for the site logo as a proxy for "page rendered"; proceed
        # anyway on timeout since results may still be present.
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#LayoutWrapper > div > div > div.stats-search-page.xpl-serp.ng2-app > div > xpl-root > header > xpl-header > div > xpl-navbar > div > div.top-navbar > div.left-side-container > div > div.xplore-logo-wrapper > xpl-xplore-logo > div > a > img'))
            )
        except TimeoutException:
            print("[警告] 本页结果表格未出现,尝试继续")
        time.sleep(2)
        rows = driver.find_elements(By.CLASS_NAME, 'List-results-items')
        print(f'有多少条论文信息:{len(rows)}')
        for i in range(len(rows)):
            # Check the limit BEFORE re-querying the DOM for the row (the
            # original fetched and printed the row first, wasting a lookup).
            if fetched_count >= limit:
                break
            print(f'{i+1}')
            # Re-query the row each iteration: extracting a row triggers page
            # activity that can make previously located elements go stale.
            row = driver.find_elements(By.CLASS_NAME, 'List-results-items')[i]
            try:
                info = extract_row_info(row, driver)
                if info:  # append only when extraction produced a record
                    results.append(info)
                    fetched_count += 1
                time.sleep(2)
            except Exception as e:
                print(f"[错误] 抓取 row 失败: {e}")
                traceback.print_exc()
                # Force focus back to the first window so a failure inside a
                # row never leaves the driver stuck on a secondary tab.
                try:
                    if driver.window_handles:
                        driver.switch_to.window(driver.window_handles[0])
                except Exception:
                    pass
        # Pagination: click "next" unless we are done or it is disabled.
        try:
            next_btn = driver.find_element(By.CSS_SELECTOR, "#xplMainContent > div.ng-SearchResults.row.g-0 > div.col > xpl-paginator > div.pagination-bar.hide-mobile.text-base-md-lh > ul > li.next-btn > button")
            if next_btn.is_enabled() and fetched_count < limit:
                _scroll_into_view(driver, next_btn)
                # Escalate through click strategies: plain click, ActionChains
                # hover-click, then a raw JS click as the last resort.
                try:
                    next_btn.click()
                except Exception:
                    try:
                        ActionChains(driver).move_to_element(next_btn).pause(1).click(next_btn).perform()
                    except Exception:
                        driver.execute_script("arguments[0].click();", next_btn)
                print("进入下一页")
                time.sleep(random.uniform(1, 2))
            else:
                break
        except NoSuchElementException:
            print("[INFO] 已到最后一页或翻页不存在")
            break
        except Exception as e:
            print(f"[错误] 翻页失败: {e}")
            break
    return results
def ieeeXplore(keyword, limit, sort_options=None):
    """Search IEEE Xplore for `keyword` and scrape results per sort order.

    Args:
        keyword: Search query string.
        limit: Maximum records to collect per sort order.
        sort_options: Iterable of sort names ("relevance" and/or
            "publication_time"); defaults to ["relevance"].

    Returns:
        dict mapping each sort name to its list of record dicts.
    """
    driver = create_browser()
    wait = WebDriverWait(driver, 15)
    collected = {}
    sort_options = sort_options or ["relevance"]  # default: relevance sort
    try:
        driver.get("https://ieeexplore.ieee.org/Xplore/home.jsp")
        # Dismiss the cookie-consent banner when present.
        try:
            accept_btn = driver.find_element(By.CSS_SELECTOR, "button.osano-cm-accept-all")
            accept_btn.click()
            print("有弹窗,点击已点击全部接受按钮")
            # Wait until the banner actually disappears.
            WebDriverWait(driver, 10).until(EC.invisibility_of_element(accept_btn))
        except NoSuchElementException:
            print("未检测到全部接受按钮")
        # Type the query and launch the search.
        search_input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "input.Typeahead-input"))
        )
        search_input.clear()
        search_input.send_keys(keyword)
        driver.find_element(
            By.CSS_SELECTOR, "button.fa.fa-search.stats-Global_Search_Icon"
        ).click()
        time.sleep(4)
        for sort_name in sort_options:
            # Apply the requested sort; failures fall through to scraping
            # whatever order is currently active.
            try:
                if sort_name == "publication_time":
                    print("[INFO] 使用时间进行排序")
                    dropdown = wait.until(
                        EC.presence_of_element_located(
                            (By.CSS_SELECTOR, "button.dropdown-toggle.xpl-btn-secondary")
                        )
                    )
                    # Scroll + JS-click to dodge ElementNotInteractable.
                    driver.execute_script("arguments[0].scrollIntoView(true);", dropdown)
                    driver.execute_script("arguments[0].click();", dropdown)
                    time.sleep(1)  # let the dropdown options render
                    newest = wait.until(
                        EC.presence_of_element_located(
                            (By.XPATH,
                             "//button[contains(@class,'dropdown-item') and contains(normalize-space(.),'Newest')]")
                        )
                    )
                    driver.execute_script("arguments[0].click();", newest)
                    time.sleep(2)
                elif sort_name == "relevance":
                    print("[INFO] 使用相关性排序(默认)")
            except Exception as e:
                print(f"[WARN] 点击排序 {sort_name} 失败:", e)
            # Scrape the results under the now-active sort.
            collected[sort_name] = crawl_current_sort(driver, limit)
    finally:
        try:
            driver.quit()
        except Exception:
            pass
    print("[DONE] PDF处理完成")
    print(json.dumps(collected, ensure_ascii=False, indent=2))
    return collected
if __name__ == '__main__':
    keyword = "bert"
    limit = 100
    # ieeeXplore(keyword, limit, ["relevance"])          # relevance only
    # ieeeXplore(keyword, limit, ["publication_time"])   # newest first only
    # Relevance first, then newest:
    ieeeXplore(keyword, limit, ["relevance", "publication_time"])