# coding=utf-8
import json
import random
import time
import traceback

from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
)
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from config import create_browser, _scroll_into_view
def find_valid_detail_tab(driver, origin_handle, timeout=10):
|
||
"""从现有句柄中挑出一个不是原始窗口的有效新标签页。"""
|
||
end_time = time.time() + timeout
|
||
while time.time() < end_time:
|
||
for handle in driver.window_handles:
|
||
if handle != origin_handle:
|
||
try:
|
||
driver.switch_to.window(handle)
|
||
current_url = driver.current_url
|
||
if not current_url.startswith("chrome://") and current_url != "about:blank":
|
||
print(f"[切换窗口] 成功 → {driver.title}")
|
||
return handle
|
||
except Exception:
|
||
pass
|
||
time.sleep(0.5)
|
||
raise Exception("未能在规定时间内找到有效详情页窗口")
|
||
# ---------主函数 ---------
|
||
def extract_row_info(row, driver):
|
||
"""抓取单条记录信息并返回字典"""
|
||
|
||
try:
|
||
type_text = row.find_element(By.XPATH, 'div / div / div[1] / span').text.strip()
|
||
except Exception:
|
||
type_text = ""
|
||
# 如果不是期刊论文,直接跳过
|
||
if type_text != "Research article":
|
||
return None
|
||
title_element = row.find_element(By.XPATH, './/h2/a/span/span/span')
|
||
title = title_element.text.strip()
|
||
print("论文名称",title)
|
||
|
||
try:
|
||
# 提取期刊名
|
||
journal_element = row.find_element(By.XPATH, './/div[@class="SubType hor text-xs u-clr-grey6"]//a')
|
||
source = journal_element.text.strip()
|
||
# 提取时间(在 srctitle-date-fields 里,期刊名后面的 span)
|
||
time_element = row.find_element(By.XPATH,'.//div[@class="SubType hor text-xs u-clr-grey6"]//span[@class="srctitle-date-fields"]/span[last()]')
|
||
date = time_element.text.strip()
|
||
|
||
except:
|
||
source=""
|
||
date=""
|
||
print(f"未找到期刊或时间")
|
||
|
||
print(f"期刊: {source} | 时间: {date}")
|
||
print("类型:", type_text)
|
||
|
||
time.sleep(1)
|
||
origin = driver.current_window_handle
|
||
existing_handles = driver.window_handles
|
||
try:
|
||
_scroll_into_view(driver, title_element)
|
||
title_element.click()
|
||
except Exception:
|
||
try:
|
||
ActionChains(driver).move_to_element(title_element).pause(1).click(title_element).perform()
|
||
except Exception:
|
||
driver.execute_script("arguments[0].click();", title_element)
|
||
|
||
try:
|
||
WebDriverWait(driver, 10).until(EC.new_window_is_opened(existing_handles))
|
||
except TimeoutException:
|
||
print("[警告] 未检测到新窗口,跳过")
|
||
return None
|
||
|
||
try:
|
||
detail_tab = find_valid_detail_tab(driver, origin)
|
||
if detail_tab not in driver.window_handles:
|
||
return None
|
||
driver.switch_to.window(detail_tab)
|
||
time.sleep(3)
|
||
#获取摘要信息
|
||
abstract_elem = driver.find_elements(By.CSS_SELECTOR, "#sp0010")
|
||
summary_text = abstract_elem.text.strip()
|
||
print("摘要:", summary_text)
|
||
authors = driver.find_elements(By.CSS_SELECTOR, "#author-group .react-xocs-alternative-link")
|
||
|
||
author_names = [a.text.strip() for a in authors if a.text.strip()]
|
||
print(author_names)
|
||
|
||
time.sleep(1)
|
||
finally:
|
||
# 仅关闭非原窗口
|
||
if driver.current_window_handle != origin:
|
||
driver.close()
|
||
driver.switch_to.window(origin)
|
||
time.sleep(random.uniform(0.5, 1.5))
|
||
|
||
return {
|
||
"title": title, # 确保函数里有定义
|
||
"author": author_names,
|
||
"source": source,
|
||
"summary": summary_text
|
||
}
|
||
def crawl_current_sort(driver, limit):
|
||
"""抓取当前排序下的 limit 条记录"""
|
||
fetched_count = 0
|
||
results = []
|
||
|
||
while fetched_count < limit:
|
||
try:
|
||
WebDriverWait(driver, 10).until(
|
||
EC.presence_of_element_located((By.XPATH, '//*[@id="main_content"]/div[3]/div[1]/div[2]/div[2]'))
|
||
)
|
||
except TimeoutException:
|
||
print("[警告] 本页结果表格未出现,尝试继续")
|
||
time.sleep(2)
|
||
|
||
rows = driver.find_elements(By.XPATH, '// *[ @ id = "srp-results-list"] / ol / li')
|
||
|
||
for row in rows:
|
||
if fetched_count >= limit:
|
||
break
|
||
try:
|
||
info = extract_row_info(row, driver)
|
||
if info:
|
||
results.append(info)
|
||
time.sleep(2)
|
||
fetched_count += 1
|
||
|
||
except Exception as e:
|
||
print(f"[错误] {e}")
|
||
traceback.print_exc()
|
||
try:
|
||
if driver.window_handles:
|
||
driver.switch_to.window(driver.window_handles[0])
|
||
except Exception:
|
||
pass
|
||
|
||
# 翻页
|
||
try:
|
||
next_btn = driver.find_element(By.XPATH, "//*[@id='srp-pagination']/li[2]/a/span")
|
||
|
||
if not next_btn.is_enabled() or fetched_count >= limit:
|
||
break
|
||
_scroll_into_view(driver, next_btn)
|
||
try:
|
||
next_btn.click()
|
||
except Exception:
|
||
try:
|
||
ActionChains(driver).move_to_element(next_btn).pause(1).click(next_btn).perform()
|
||
except Exception:
|
||
driver.execute_script("arguments[0].click();", next_btn)
|
||
time.sleep(5)
|
||
except Exception:
|
||
print("[INFO] 已到最后一页或翻页失败")
|
||
break
|
||
|
||
return results
|
||
def scienceDirect(keyword, limit):
|
||
"""主函数:三种排序抓取"""
|
||
driver = create_browser()
|
||
wait = WebDriverWait(driver, 15)
|
||
all_results = {}
|
||
|
||
sortings = {
|
||
"relevance": None,
|
||
"publication_time": "#srp-sorting-options > div > a > span",
|
||
}
|
||
|
||
try:
|
||
driver.get("https://www.sciencedirect.com/")
|
||
|
||
search_input = WebDriverWait(driver, 10).until(
|
||
EC.presence_of_element_located((By.ID, "qs"))
|
||
)
|
||
search_input.send_keys(keyword)
|
||
time.sleep(2)
|
||
search_button = driver.find_element(By.XPATH, '//*[@id="searchbar"]/div/div/form/div[2]/button')
|
||
search_button.click()
|
||
time.sleep(3)
|
||
|
||
for sort_name, css_selector in sortings.items():
|
||
if css_selector:
|
||
try:
|
||
driver.find_element(By.CSS_SELECTOR, css_selector).click()
|
||
time.sleep(5)
|
||
except Exception:
|
||
print(f"[WARN] 点击排序 {sort_name} 失败")
|
||
|
||
results = crawl_current_sort(driver, limit)
|
||
all_results[sort_name] = results
|
||
finally:
|
||
try:
|
||
driver.quit()
|
||
except Exception:
|
||
pass
|
||
print("[DONE] PDF处理完成")
|
||
print(json.dumps(all_results, ensure_ascii=False, indent=2))
|
||
|
||
return json.dumps(all_results, ensure_ascii=False, indent=2)
|
||
|
||
if __name__ == '__main__':
|
||
keyword = "graphrag"
|
||
limit=10
|
||
scienceDirect(keyword,limit)
|