# coding=utf-8
"""ScienceDirect search scraper.

Searches sciencedirect.com for a keyword, then for each sort order scrapes
up to `limit` "Research article" rows: title, journal, date, authors and
abstract (the last two from the article detail tab, which is opened and
closed per row). Results are returned as a JSON string keyed by sort name.
"""
import json
import time
import random
import traceback

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    TimeoutException,
    NoSuchElementException,
)
from selenium.webdriver.common.action_chains import ActionChains

from config import create_browser, _scroll_into_view


def find_valid_detail_tab(driver, origin_handle, timeout=10):
    """Return the handle of a valid detail tab other than the original window.

    Polls `driver.window_handles` until a non-origin tab with a real URL
    (not ``chrome://`` internal pages, not ``about:blank``) is found, and
    leaves the driver switched to that tab.

    Args:
        driver: active WebDriver instance.
        origin_handle: handle of the original (results) window to skip.
        timeout: seconds to keep polling before giving up.

    Raises:
        Exception: if no valid detail tab appears within `timeout` seconds.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        for handle in driver.window_handles:
            if handle == origin_handle:
                continue
            try:
                driver.switch_to.window(handle)
                current_url = driver.current_url
                # Skip browser-internal or still-blank tabs.
                if not current_url.startswith("chrome://") and current_url != "about:blank":
                    print(f"[切换窗口] 成功 → {driver.title}")
                    return handle
            except Exception:
                # The handle may have closed between enumeration and switch.
                pass
        time.sleep(0.5)
    raise Exception("未能在规定时间内找到有效详情页窗口")


def _click_element(driver, element):
    """Best-effort click: scroll into view + native click, falling back to
    ActionChains, then to a JavaScript click (same fallback chain the
    original used inline at both call sites)."""
    try:
        _scroll_into_view(driver, element)
        element.click()
    except Exception:
        try:
            ActionChains(driver).move_to_element(element).pause(1).click(element).perform()
        except Exception:
            driver.execute_script("arguments[0].click();", element)


# --------- main scraping routines ---------
def extract_row_info(row, driver):
    """Scrape one search-result row and return it as a dict.

    Skips rows whose type label is not "Research article". Opens the
    article's detail page in a new tab to collect abstract and authors,
    then always closes that tab and switches back to the results window.

    Returns:
        dict with keys "title", "author", "source", "summary",
        or None when the row is skipped / no detail window opened.
    """
    try:
        type_text = row.find_element(By.XPATH, 'div/div/div[1]/span').text.strip()
    except Exception:
        type_text = ""

    # Only journal research articles are of interest.
    if type_text != "Research article":
        return None

    title_element = row.find_element(By.XPATH, './/h2/a/span/span/span')
    title = title_element.text.strip()
    print("论文名称", title)

    source = ""
    date = ""
    try:
        # Journal name.
        journal_element = row.find_element(
            By.XPATH, './/div[@class="SubType hor text-xs u-clr-grey6"]//a')
        source = journal_element.text.strip()
        # Publication date: last span inside srctitle-date-fields.
        time_element = row.find_element(
            By.XPATH,
            './/div[@class="SubType hor text-xs u-clr-grey6"]'
            '//span[@class="srctitle-date-fields"]/span[last()]')
        date = time_element.text.strip()
    except Exception:
        print("未找到期刊或时间")

    print(f"期刊: {source} | 时间: {date}")
    print("类型:", type_text)
    time.sleep(1)

    origin = driver.current_window_handle
    existing_handles = driver.window_handles

    # Click the title; the site opens the article in a new tab.
    _click_element(driver, title_element)

    try:
        WebDriverWait(driver, 10).until(EC.new_window_is_opened(existing_handles))
    except TimeoutException:
        print("[警告] 未检测到新窗口,跳过")
        return None

    # Defaults so the return below is safe even if scraping partially fails.
    summary_text = ""
    author_names = []
    try:
        detail_tab = find_valid_detail_tab(driver, origin)
        if detail_tab not in driver.window_handles:
            return None
        driver.switch_to.window(detail_tab)
        time.sleep(3)

        # BUG FIX: the original called `.text` on the list returned by
        # find_elements(); use the first match (if any) instead.
        abstract_elems = driver.find_elements(By.CSS_SELECTOR, "#sp0010")
        if abstract_elems:
            summary_text = abstract_elems[0].text.strip()
        print("摘要:", summary_text)

        authors = driver.find_elements(
            By.CSS_SELECTOR, "#author-group .react-xocs-alternative-link")
        author_names = [a.text.strip() for a in authors if a.text.strip()]
        print(author_names)
        time.sleep(1)
    finally:
        # Close only the detail tab, never the original results window.
        if driver.current_window_handle != origin:
            driver.close()
        driver.switch_to.window(origin)
        time.sleep(random.uniform(0.5, 1.5))

    return {
        "title": title,
        "author": author_names,
        "source": source,
        "summary": summary_text,
    }


def crawl_current_sort(driver, limit):
    """Scrape up to `limit` records under the currently active sort order.

    Iterates result pages, extracting each row via extract_row_info and
    following the "next page" control until `limit` is reached or
    pagination fails.

    Returns:
        list of row dicts (possibly shorter than `limit`).
    """
    fetched_count = 0
    results = []
    while fetched_count < limit:
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, '//*[@id="main_content"]/div[3]/div[1]/div[2]/div[2]'))
            )
        except TimeoutException:
            # Best effort: the rows query below may still succeed.
            print("[警告] 本页结果表格未出现,尝试继续")
        time.sleep(2)

        rows = driver.find_elements(By.XPATH, '//*[@id="srp-results-list"]/ol/li')
        for row in rows:
            if fetched_count >= limit:
                break
            try:
                info = extract_row_info(row, driver)
                if info:
                    results.append(info)
                    time.sleep(2)
                    fetched_count += 1
            except Exception as e:
                print(f"[错误] {e}")
                traceback.print_exc()
                # Recover to the first window so the next row can proceed.
                try:
                    if driver.window_handles:
                        driver.switch_to.window(driver.window_handles[0])
                except Exception:
                    pass

        # Pagination: click "next" unless disabled or the quota is reached.
        try:
            next_btn = driver.find_element(
                By.XPATH, "//*[@id='srp-pagination']/li[2]/a/span")
            if not next_btn.is_enabled() or fetched_count >= limit:
                break
            _click_element(driver, next_btn)
            time.sleep(5)
        except Exception:
            print("[INFO] 已到最后一页或翻页失败")
            break
    return results


def scienceDirect(keyword, limit):
    """Entry point: search `keyword` and scrape `limit` rows per sort order.

    Runs the scrape once with the default ("relevance") sort and once
    sorted by publication time, quitting the browser in all cases.

    Returns:
        JSON string mapping sort name -> list of row dicts.
    """
    driver = create_browser()
    all_results = {}
    sortings = {
        "relevance": None,  # default order — no click required
        "publication_time": "#srp-sorting-options > div > a > span",
    }
    try:
        driver.get("https://www.sciencedirect.com/")
        search_input = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "qs"))
        )
        search_input.send_keys(keyword)
        time.sleep(2)
        search_button = driver.find_element(
            By.XPATH, '//*[@id="searchbar"]/div/div/form/div[2]/button')
        search_button.click()
        time.sleep(3)

        for sort_name, css_selector in sortings.items():
            if css_selector:
                try:
                    driver.find_element(By.CSS_SELECTOR, css_selector).click()
                    time.sleep(5)
                except Exception:
                    print(f"[WARN] 点击排序 {sort_name} 失败")
            all_results[sort_name] = crawl_current_sort(driver, limit)
    finally:
        # Always release the browser, even if the scrape blew up.
        try:
            driver.quit()
        except Exception:
            pass

    print("[DONE] PDF处理完成")
    payload = json.dumps(all_results, ensure_ascii=False, indent=2)
    print(payload)
    return payload


if __name__ == '__main__':
    keyword = "graphrag"
    limit = 10
    scienceDirect(keyword, limit)