# coding=utf-8
import csv
import json
import re
import time
import random
import traceback
import sys
import os

import requests
from bs4 import BeautifulSoup

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    ElementClickInterceptedException
)
from selenium.webdriver.common.action_chains import ActionChains
from config import create_browser, _scroll_into_view


def get_abstract_in_new_tab(url, headers=None, timeout=100):
    """Fetch a paper's abstract from its detail-page URL (no Selenium required)."""
    if headers is None:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36"
        }
    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
        resp.raise_for_status()
    except requests.RequestException as e:
        print(f"[ERROR] Failed to fetch the page: {e}")
        return ""

    # IEEE Xplore embeds the document metadata (including the abstract) in a
    # JavaScript assignment, so extract it with a regex instead of parsing the DOM.
    match = re.search(r"xplGlobal\.document\.metadata\s*=\s*(\{.*?\});", resp.text, re.S)
    if match:
        metadata_json = match.group(1)
        try:
            metadata = json.loads(metadata_json)
            abstract = metadata.get("abstract", "")
        except json.JSONDecodeError:
            abstract = ""
    else:
        abstract = ""
    return abstract


# --------- Scraping helpers ---------
def extract_row_info(row, driver):
    """Scrape one search-result row and return it as a dict."""
    try:
        urlIndex = row.find_element(By.CLASS_NAME, "fw-bold")
        relative_link = urlIndex.get_attribute("href")
        title = urlIndex.text.strip()
        base_url = "https://ieeexplore.ieee.org"
        if relative_link.startswith("/"):
            originalLink = base_url + relative_link
        else:
            originalLink = relative_link
    except Exception as e:
        print(f"[WARN] Failed to get the paper title or link: {e}")
        title = ""
        originalLink = ""
    print("Paper title:", title)
    print("Paper link:", originalLink)

    try:
        authors = [a.text for a in row.find_elements(By.CSS_SELECTOR, 'xpl-authors-name-list a span')]
        authors = [a for a in authors if a.strip()]
    except Exception as e:
        print(f"[WARN] Failed to get the author list: {e}")
        authors = []
    print("Authors:", authors)

    # Initialize up front so the return statement never hits a NameError
    Conference = ""
    try:
        Conference = row.find_element(By.CSS_SELECTOR, "a[xplhighlight]").text.strip()
        print("Conference:", Conference)
    except Exception:
        print("No conference information found")

    try:
        info_text = row.find_element(By.CSS_SELECTOR, "div.publisher-info-container").text
        # info_text looks like "Year: 2025 | Conference Paper | Publisher: IEEE"
        parts = [p.strip() for p in info_text.split('|')]
        date = parts[0].replace("Year:", "").strip()
        paper_type = parts[1] if len(parts) > 1 else ""
        print(f"Year: {date}, type: {paper_type}")
    except Exception:
        date = ""
        paper_type = ""
        print("No year or type found")

    time.sleep(1)
    abstract = get_abstract_in_new_tab(originalLink)
    print("Abstract:", abstract)
    time.sleep(2)  # give the page time to settle

    return {
        "title": title,
        "originalLink": originalLink,
        "author": authors,
        "type": paper_type,
        "Conference": Conference,
        "date": date,
        "site": "ieeeXplore",
        "summary": abstract
    }


def crawl_current_sort(driver, limit):
    """Scrape up to `limit` records under the current sort order."""
    fetched_count = 0
    results = []
    while fetched_count < limit:
        try:
            # Wait for the Xplore logo as a proxy for the page having loaded
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#LayoutWrapper > div > div > div.stats-search-page.xpl-serp.ng2-app > div > xpl-root > header > xpl-header > div > xpl-navbar > div > div.top-navbar > div.left-side-container > div > div.xplore-logo-wrapper > xpl-xplore-logo > div > a > img'))
            )
        except TimeoutException:
            print("[WARN] This page's results did not appear; continuing anyway")
        time.sleep(2)
        rows = driver.find_elements(By.CLASS_NAME, 'List-results-items')
        print(f"Number of result rows: {len(rows)}")
        for i in range(len(rows)):
            print(f"Row {i + 1}")
            # Re-query the rows on each iteration to avoid stale element references
            row = driver.find_elements(By.CLASS_NAME, 'List-results-items')[i]
            if fetched_count >= limit:
                break
            try:
                info = extract_row_info(row, driver)
                if info:  # only append valid records
                    results.append(info)
                    fetched_count += 1
                time.sleep(2)
            except Exception as e:
                print(f"[ERROR] Failed to scrape row: {e}")
                traceback.print_exc()
                # If any windows remain, force a switch back to the main window to avoid a deadlock
                try:
                    if driver.window_handles:
                        driver.switch_to.window(driver.window_handles[0])
                except Exception:
                    pass
                continue  # move on to the next row; one failure should not stop the crawl

        # Pagination
        try:
            next_btn = driver.find_element(By.CSS_SELECTOR, "#xplMainContent > div.ng-SearchResults.row.g-0 > div.col > xpl-paginator > div.pagination-bar.hide-mobile.text-base-md-lh > ul > li.next-btn > button")
            if next_btn.is_enabled() and fetched_count < limit:
                _scroll_into_view(driver, next_btn)
                try:
                    next_btn.click()
                except Exception:
                    try:
                        ActionChains(driver).move_to_element(next_btn).pause(1).click(next_btn).perform()
                    except Exception:
                        driver.execute_script("arguments[0].click();", next_btn)
                print("Moving to the next page")
                time.sleep(random.uniform(1, 2))
            else:
                break
        except NoSuchElementException:
            print("[INFO] Reached the last page, or no pagination control exists")
            break
        except Exception as e:
            print(f"[ERROR] Pagination failed: {e}")
            break
    return results
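
# --------- Optional: CSV export ---------
# A minimal sketch of how the scraped records could be written to disk; it uses
# the csv module imported above, which the crawler itself never touches. The
# helper name save_results_to_csv and the column layout are assumptions for
# illustration, not part of the original script.
def save_results_to_csv(all_results, path):
    """Flatten {sort_name: [record, ...]} into a single CSV file."""
    fieldnames = ["sort", "title", "originalLink", "author", "type",
                  "Conference", "date", "site", "summary"]
    with open(path, "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for sort_name, records in all_results.items():
            for record in records:
                row = dict(record)
                row["sort"] = sort_name
                row["author"] = "; ".join(row.get("author", []))  # list -> one cell
                writer.writerow(row)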
def ieeeXplore(keyword, limit, sort_options=None):
    """Entry point: run the crawl once per requested sort order."""
    driver = create_browser()
    wait = WebDriverWait(driver, 15)
    all_results = {}
    if not sort_options:
        sort_options = ["relevance"]  # default to relevance
    try:
        driver.get("https://ieeexplore.ieee.org/Xplore/home.jsp")
        try:
            accept_btn = driver.find_element(By.CSS_SELECTOR, "button.osano-cm-accept-all")
            accept_btn.click()
            print("Cookie banner detected; clicked the accept-all button")
            # Wait for the banner to disappear
            WebDriverWait(driver, 10).until(
                EC.invisibility_of_element(accept_btn)
            )
        except NoSuchElementException:
            # No banner, just continue
            print("No accept-all button detected")

        input_box = wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, "input.Typeahead-input")
        ))
        input_box.clear()
        input_box.send_keys(keyword)
        # Locate and click the search button
        search_btn = driver.find_element(By.CSS_SELECTOR, "button.fa.fa-search.stats-Global_Search_Icon")
        search_btn.click()
        time.sleep(4)

        for sort_name in sort_options:
            try:
                if sort_name == "publication_time":
                    print("[INFO] Sorting by publication time")
                    # Locate the sort dropdown button
                    dropdown_btn = wait.until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, "button.dropdown-toggle.xpl-btn-secondary"))
                    )
                    # Scroll the dropdown button into view
                    driver.execute_script("arguments[0].scrollIntoView(true);", dropdown_btn)
                    # Click via JS to avoid ElementNotInteractable errors
                    driver.execute_script("arguments[0].click();", dropdown_btn)
                    # Give the dropdown options a moment to render
                    time.sleep(1)
                    # Locate the "Newest" option
                    newest_option = wait.until(
                        EC.presence_of_element_located(
                            (By.XPATH, "//button[contains(@class,'dropdown-item') and contains(normalize-space(.),'Newest')]")
                        )
                    )
                    # Click the option via JS as well
                    driver.execute_script("arguments[0].click();", newest_option)
                    time.sleep(2)
                elif sort_name == "relevance":
                    print("[INFO] Sorting by relevance (default)")
            except Exception as e:
                print(f"[WARN] Failed to apply sort {sort_name}:", e)

            # Scrape the results under the current sort order
            results = crawl_current_sort(driver, limit)
            all_results[sort_name] = results
    finally:
        try:
            driver.quit()
        except Exception:
            pass

    print("[DONE] Crawl finished")
    print(json.dumps(all_results, ensure_ascii=False, indent=2))
    return all_results


if __name__ == '__main__':
    keyword = "bert"
    limit = 100
    # ieeeXplore(keyword, limit, ["relevance"])           # relevance only
    # ieeeXplore(keyword, limit, ["publication_time"])    # newest only
    ieeeXplore(keyword, limit, ["relevance", "publication_time"])  # relevance first, then newest
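
# Example usage of the CSV helper above (hypothetical; the filename is a placeholder):
#     results = ieeeXplore("bert", 10, ["relevance"])
#     save_results_to_csv(results, "ieee_results.csv")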