selenium_keyan/selenium/utils/scienceDirect.py

210 lines
7.4 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# coding=utf-8
import json
import time
import random
import traceback
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
TimeoutException, NoSuchElementException,
)
from selenium.webdriver.common.action_chains import ActionChains
from config import create_browser,_scroll_into_view
def find_valid_detail_tab(driver, origin_handle, timeout=10):
    """Find a newly opened, usable detail tab among the driver's windows.

    Polls every window handle other than ``origin_handle`` until one has a
    real page loaded (not ``chrome://`` or ``about:blank``), switching the
    driver to it.

    Args:
        driver: Selenium WebDriver instance.
        origin_handle: Handle of the original (search results) window.
        timeout: Seconds to keep polling before giving up.

    Returns:
        The handle of the valid detail tab; the driver is left switched to it.

    Raises:
        Exception: If no valid detail window appears within ``timeout``.
    """
    # time.monotonic() is immune to system clock adjustments, unlike the
    # original time.time()-based deadline.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        for handle in driver.window_handles:
            if handle == origin_handle:
                continue
            try:
                driver.switch_to.window(handle)
                current_url = driver.current_url
                # Skip browser-internal pages and tabs that have not loaded yet.
                if not current_url.startswith("chrome://") and current_url != "about:blank":
                    print(f"[切换窗口] 成功 → {driver.title}")
                    return handle
            except Exception:
                # The handle may have been closed between enumeration and switch.
                pass
        time.sleep(0.5)
    # Restore focus to the original window before failing, so the caller is
    # not left on an arbitrary half-loaded tab.
    try:
        driver.switch_to.window(origin_handle)
    except Exception:
        pass
    raise Exception("未能在规定时间内找到有效详情页窗口")
# ---------主函数 ---------
def extract_row_info(row, driver):
    """Extract one search-result row into a record dict (research articles only).

    Opens the article's detail page in a new tab to collect the abstract and
    author list, then closes that tab and switches back to the results window.

    Args:
        row: Selenium WebElement for one ``<li>`` result row.
        driver: Selenium WebDriver instance.

    Returns:
        dict with keys ``title``, ``author``, ``source``, ``date``,
        ``summary`` — or ``None`` when the row is not a "Research article"
        or the detail tab could not be opened.
    """
    try:
        type_text = row.find_element(By.XPATH, 'div/div/div[1]/span').text.strip()
    except Exception:
        type_text = ""
    # Skip everything that is not a journal research article.
    if type_text != "Research article":
        return None

    title_element = row.find_element(By.XPATH, './/h2/a/span/span/span')
    title = title_element.text.strip()
    print("论文名称", title)

    source = ""
    date = ""
    try:
        # Journal name lives in the row's SubType header.
        journal_element = row.find_element(
            By.XPATH, './/div[@class="SubType hor text-xs u-clr-grey6"]//a')
        source = journal_element.text.strip()
        # Publication date is the last span inside srctitle-date-fields,
        # right after the journal name.
        time_element = row.find_element(
            By.XPATH,
            './/div[@class="SubType hor text-xs u-clr-grey6"]'
            '//span[@class="srctitle-date-fields"]/span[last()]')
        date = time_element.text.strip()
    except NoSuchElementException:
        # Narrowed from the original bare ``except:``.
        print("未找到期刊或时间")
    print(f"期刊: {source} | 时间: {date}")
    print("类型:", type_text)

    time.sleep(1)
    origin = driver.current_window_handle
    existing_handles = driver.window_handles

    # Click the title; fall back through ActionChains and raw JS because the
    # site sometimes overlays elements that intercept the native click.
    try:
        _scroll_into_view(driver, title_element)
        title_element.click()
    except Exception:
        try:
            ActionChains(driver).move_to_element(title_element).pause(1).click(title_element).perform()
        except Exception:
            driver.execute_script("arguments[0].click();", title_element)

    try:
        WebDriverWait(driver, 10).until(EC.new_window_is_opened(existing_handles))
    except TimeoutException:
        print("[警告] 未检测到新窗口,跳过")
        return None

    summary_text = ""
    author_names = []
    try:
        detail_tab = find_valid_detail_tab(driver, origin)
        if detail_tab not in driver.window_handles:
            return None
        driver.switch_to.window(detail_tab)
        time.sleep(3)
        # BUG FIX: the original used find_elements (a list) and then called
        # .text on the list, raising AttributeError on every row.
        try:
            summary_text = driver.find_element(By.CSS_SELECTOR, "#sp0010").text.strip()
        except NoSuchElementException:
            pass
        print("摘要:", summary_text)
        authors = driver.find_elements(By.CSS_SELECTOR, "#author-group .react-xocs-alternative-link")
        author_names = [a.text.strip() for a in authors if a.text.strip()]
        print(author_names)
        time.sleep(1)
    finally:
        # Close only the detail tab, never the original results window.
        if driver.current_window_handle != origin:
            driver.close()
        driver.switch_to.window(origin)
        time.sleep(random.uniform(0.5, 1.5))

    return {
        "title": title,
        "author": author_names,
        "source": source,
        "date": date,  # was extracted but silently dropped by the original
        "summary": summary_text,
    }
def crawl_current_sort(driver, limit):
    """Scrape up to ``limit`` research-article records under the current sort.

    Walks the result pages, extracting each row with extract_row_info and
    clicking the "next" pagination control until enough records are
    collected or no further page exists.

    Args:
        driver: Selenium WebDriver positioned on a results page.
        limit: Maximum number of records to collect.

    Returns:
        List of record dicts (see extract_row_info).
    """
    fetched_count = 0
    results = []
    while fetched_count < limit:
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, '//*[@id="main_content"]/div[3]/div[1]/div[2]/div[2]'))
            )
        except TimeoutException:
            print("[警告] 本页结果表格未出现,尝试继续")
        time.sleep(2)
        rows = driver.find_elements(By.XPATH, '//*[@id="srp-results-list"]/ol/li')
        for row in rows:
            if fetched_count >= limit:
                break
            try:
                info = extract_row_info(row, driver)
                # Count only rows that actually produced a record, so that
                # ``limit`` means "limit research articles", not "limit rows".
                if info:
                    results.append(info)
                    time.sleep(2)
                    fetched_count += 1
            except Exception as e:
                print(f"[错误] {e}")
                traceback.print_exc()
                # Best effort: recover focus on the first (results) window.
                try:
                    if driver.window_handles:
                        driver.switch_to.window(driver.window_handles[0])
                except Exception:
                    pass
        # Pagination: advance to the next results page.
        try:
            next_btn = driver.find_element(By.XPATH, "//*[@id='srp-pagination']/li[2]/a/span")
            if not next_btn.is_enabled() or fetched_count >= limit:
                break
            _scroll_into_view(driver, next_btn)
            try:
                next_btn.click()
            except Exception:
                try:
                    ActionChains(driver).move_to_element(next_btn).pause(1).click(next_btn).perform()
                except Exception:
                    driver.execute_script("arguments[0].click();", next_btn)
            time.sleep(5)
        except Exception:
            print("[INFO] 已到最后一页或翻页失败")
            break
    return results
def scienceDirect(keyword, limit):
    """Search ScienceDirect for ``keyword`` and scrape results under two sorts.

    Sorts: "relevance" (the site's default order) and "publication_time"
    (after clicking the date-sort toggle). Each sort yields up to ``limit``
    research-article records via crawl_current_sort.

    Args:
        keyword: Search query string.
        limit: Maximum records per sort.

    Returns:
        JSON string (ensure_ascii=False, indent=2) mapping
        sort name -> list of record dicts.
    """
    driver = create_browser()
    all_results = {}
    # None => keep the default (relevance) order; otherwise the CSS selector
    # of the control that switches to that sort.
    sortings = {
        "relevance": None,
        "publication_time": "#srp-sorting-options > div > a > span",
    }
    try:
        driver.get("https://www.sciencedirect.com/")
        search_input = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "qs"))
        )
        search_input.send_keys(keyword)
        time.sleep(2)
        search_button = driver.find_element(
            By.XPATH, '//*[@id="searchbar"]/div/div/form/div[2]/button')
        search_button.click()
        time.sleep(3)
        for sort_name, css_selector in sortings.items():
            if css_selector:
                try:
                    driver.find_element(By.CSS_SELECTOR, css_selector).click()
                    time.sleep(5)
                except Exception:
                    print(f"[WARN] 点击排序 {sort_name} 失败")
            all_results[sort_name] = crawl_current_sort(driver, limit)
    finally:
        # Always release the browser, even when the crawl raised.
        try:
            driver.quit()
        except Exception:
            pass
    # Fixed misleading log: this script processes no PDFs (message was
    # copied from another script).
    print("[DONE] 抓取完成")
    output = json.dumps(all_results, ensure_ascii=False, indent=2)
    print(output)
    return output
if __name__ == '__main__':
    # Manual smoke run: scrape up to 10 records per sort for "graphrag".
    scienceDirect("graphrag", 10)