# selenium_keyan/selenium/utils/ieeeXplore.py
# coding=utf-8
import csv
import json
import os
import random
import re
import sys
import time
import traceback

import requests
from bs4 import BeautifulSoup

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    ElementClickInterceptedException,
)
from selenium.webdriver.common.action_chains import ActionChains
from config import create_browser, _scroll_into_view


def get_abstract_in_new_tab(url, headers=None, timeout=100):
    """Fetch a paper's abstract from its detail-page URL (no Selenium needed)."""
    if headers is None:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36"
        }
    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
        resp.raise_for_status()
    except requests.RequestException as e:
        print(f"[ERROR] Failed to fetch page: {e}")
        return ""
    # IEEE Xplore embeds the paper metadata as a JavaScript object in the page
    # source; extract the abstract from it instead of parsing the rendered DOM.
    match = re.search(r"xplGlobal\.document\.metadata\s*=\s*(\{.*?\});", resp.text, re.S)
    if not match:
        return ""
    try:
        metadata = json.loads(match.group(1))
    except json.JSONDecodeError:
        return ""
    return metadata.get("abstract", "")
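

# Illustrative sketch only (not called anywhere): demonstrates how the regex
# in get_abstract_in_new_tab pulls the abstract out of the embedded metadata
# object. The HTML snippet below is a made-up miniature of a detail page, not
# real IEEE Xplore markup.
def _demo_metadata_extraction():
    html = (
        "<script>"
        'xplGlobal.document.metadata={"abstract":"An example abstract."};'
        "</script>"
    )
    match = re.search(r"xplGlobal\.document\.metadata\s*=\s*(\{.*?\});", html, re.S)
    return json.loads(match.group(1)).get("abstract", "") if match else ""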


# --------- Row extraction ---------
def extract_row_info(row, driver):
    """Scrape one search-result row and return it as a dict."""
    title = ""
    originalLink = ""
    try:
        urlIndex = row.find_element(By.CLASS_NAME, "fw-bold")
        relative_link = urlIndex.get_attribute("href")
        title = urlIndex.text.strip()
        base_url = "https://ieeexplore.ieee.org"
        if relative_link.startswith("/"):
            originalLink = base_url + relative_link
        else:
            originalLink = relative_link
    except Exception as e:
        print(f"[WARN] Failed to get paper title or link: {e}")
    print("Title:", title)
    print("Link:", originalLink)
    try:
        authors = [a.text for a in row.find_elements(By.CSS_SELECTOR, 'xpl-authors-name-list a span')]
        authors = [a for a in authors if a.strip()]
    except Exception as e:
        print(f"[WARN] Failed to get author list: {e}")
        authors = []
    print("Authors:", authors)
    # Initialize up front so a missing element cannot leave the name unbound
    # when the result dict is built below.
    Conference = ""
    try:
        Conference = row.find_element(By.CSS_SELECTOR, "a[xplhighlight]").text.strip()
        print("Conference:", Conference)
    except NoSuchElementException:
        print("No conference info found")
    try:
        info_text = row.find_element(By.CSS_SELECTOR, "div.publisher-info-container").text
        # info_text looks like "Year: 2025 | Conference Paper | Publisher: IEEE"
        parts = [p.strip() for p in info_text.split('|')]
        date = parts[0].replace("Year:", "").strip()
        paper_type = parts[1] if len(parts) > 1 else ""
        print(f"Year: {date}, type: {paper_type}")
    except NoSuchElementException:
        date = ""
        paper_type = ""
        print("No year or type found")
    time.sleep(1)
    abstract = get_abstract_in_new_tab(originalLink)
    print("Abstract:", abstract)
    time.sleep(2)  # brief pause so requests are spaced out
    return {
        "title": title,
        "originalLink": originalLink,
        "author": authors,
        "type": paper_type,
        "Conference": Conference,
        "date": date,
        "site": "ieeeXplore",
        "summary": abstract,
    }
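

# Minimal sketch of persisting the records returned by extract_row_info,
# using the csv import above. The output path, column order, and the helper
# itself are assumptions for illustration; nothing in this file calls it.
def _write_results_csv(results, path="ieee_results.csv"):
    fields = ["title", "originalLink", "author", "type",
              "Conference", "date", "site", "summary"]
    with open(path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fields)
        writer.writeheader()
        for record in results:
            row = dict(record)
            row["author"] = "; ".join(row.get("author", []))  # flatten author list
            writer.writerow(row)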


def crawl_current_sort(driver, limit):
    """Scrape up to `limit` records under the current sort order."""
    fetched_count = 0
    results = []
    while fetched_count < limit:
        try:
            # Wait for the page header logo as a signal that the page loaded
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#LayoutWrapper > div > div > div.stats-search-page.xpl-serp.ng2-app > div > xpl-root > header > xpl-header > div > xpl-navbar > div > div.top-navbar > div.left-side-container > div > div.xplore-logo-wrapper > xpl-xplore-logo > div > a > img'))
            )
        except TimeoutException:
            print("[WARN] Results page did not appear; continuing anyway")
        time.sleep(2)
        rows = driver.find_elements(By.CLASS_NAME, 'List-results-items')
        print(f'Number of result rows on this page: {len(rows)}')
        for i in range(len(rows)):
            print(f'{i + 1}')
            # Re-find the rows each iteration to avoid stale element references
            row = driver.find_elements(By.CLASS_NAME, 'List-results-items')[i]
            if fetched_count >= limit:
                break
            try:
                info = extract_row_info(row, driver)
                if info:  # only append valid records
                    results.append(info)
                    fetched_count += 1
                time.sleep(2)
            except Exception as e:
                print(f"[ERROR] Failed to scrape row: {e}")
                traceback.print_exc()
                # If windows are still open, force focus back to the main
                # window so the crawl cannot get stuck on a detail tab.
                try:
                    if driver.window_handles:
                        driver.switch_to.window(driver.window_handles[0])
                except Exception:
                    pass
                continue  # one bad row should not stop the whole crawl
        # Pagination
        try:
            next_btn = driver.find_element(By.CSS_SELECTOR, "#xplMainContent > div.ng-SearchResults.row.g-0 > div.col > xpl-paginator > div.pagination-bar.hide-mobile.text-base-md-lh > ul > li.next-btn > button")
            if next_btn.is_enabled() and fetched_count < limit:
                _scroll_into_view(driver, next_btn)
                # Try a native click, then ActionChains, then JavaScript
                try:
                    next_btn.click()
                except Exception:
                    try:
                        ActionChains(driver).move_to_element(next_btn).pause(1).click(next_btn).perform()
                    except Exception:
                        driver.execute_script("arguments[0].click();", next_btn)
                print("Moving to next page")
                time.sleep(random.uniform(1, 2))
            else:
                break
        except NoSuchElementException:
            print("[INFO] Reached the last page, or no pagination control")
            break
        except Exception as e:
            print(f"[ERROR] Pagination failed: {e}")
            break
    return results
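

# The pagination code above tries a native click, then an ActionChains click,
# then a JavaScript click. A hypothetical helper condensing that fallback
# chain could look like this (sketch only; the crawler does not call it):
def _click_with_fallbacks(driver, element):
    try:
        element.click()  # native click; may be intercepted by overlays
    except Exception:
        try:
            ActionChains(driver).move_to_element(element).pause(1).click(element).perform()
        except Exception:
            driver.execute_script("arguments[0].click();", element)  # last resort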


def ieeeXplore(keyword, limit, sort_options=None):
    """Entry point: scrape the results under each requested sort order."""
    driver = create_browser()
    wait = WebDriverWait(driver, 15)
    all_results = {}
    if not sort_options:
        sort_options = ["relevance"]  # default: sort by relevance
    try:
        driver.get("https://ieeexplore.ieee.org/Xplore/home.jsp")
        try:
            accept_btn = driver.find_element(By.CSS_SELECTOR, "button.osano-cm-accept-all")
            accept_btn.click()
            print("Cookie banner found; clicked the accept-all button")
            # Wait for the banner to disappear
            WebDriverWait(driver, 10).until(
                EC.invisibility_of_element(accept_btn)
            )
        except NoSuchElementException:
            # No banner; carry on
            print("No accept-all button detected")
        input_box = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, "input.Typeahead-input")
        ))
        input_box.clear()
        input_box.send_keys(keyword)
        # Locate the search button
        search_btn = driver.find_element(By.CSS_SELECTOR, "button.fa.fa-search.stats-Global_Search_Icon")
        search_btn.click()
        time.sleep(4)
        for sort_name in sort_options:
            try:
                if sort_name == "publication_time":
                    print("[INFO] Sorting by publication time")
                    # Locate the sort dropdown
                    dropdown_btn = wait.until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, "button.dropdown-toggle.xpl-btn-secondary"))
                    )
                    # Scroll the dropdown button into view
                    driver.execute_script("arguments[0].scrollIntoView(true);", dropdown_btn)
                    # Click via JS to avoid ElementNotInteractable errors
                    driver.execute_script("arguments[0].click();", dropdown_btn)
                    # Give the dropdown options a moment to render
                    time.sleep(1)
                    # Locate the "Newest" option
                    newest_option = wait.until(
                        EC.presence_of_element_located(
                            (By.XPATH,
                             "//button[contains(@class,'dropdown-item') and contains(normalize-space(.),'Newest')]")
                        )
                    )
                    # Click the option via JS
                    driver.execute_script("arguments[0].click();", newest_option)
                    time.sleep(2)
                elif sort_name == "relevance":
                    print("[INFO] Sorting by relevance (default)")
            except Exception as e:
                print(f"[WARN] Failed to apply sort {sort_name}:", e)
            # Scrape the results under the current sort order
            results = crawl_current_sort(driver, limit)
            all_results[sort_name] = results
    finally:
        try:
            driver.quit()
        except Exception:
            pass
    print("[DONE] Crawl finished")
    print(json.dumps(all_results, ensure_ascii=False, indent=2))
    return all_results
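

# Sketch of writing the crawl output to disk; the filename is an assumption.
# ieeeXplore already prints the same structure, so this is purely optional.
def _save_results_json(all_results, path="ieee_results.json"):
    with open(path, "w", encoding="utf-8") as f:
        json.dump(all_results, f, ensure_ascii=False, indent=2)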


if __name__ == '__main__':
    keyword = "bert"
    limit = 100
    # ieeeXplore(keyword, limit, ["relevance"])           # relevance only
    # ieeeXplore(keyword, limit, ["publication_time"])    # newest only
    # Relevance first, then newest:
    ieeeXplore(keyword, limit, ["relevance", "publication_time"])