# selenium_keyan/selenium/utils/weipu.py
# coding=utf-8
import json
import time
import random
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import traceback
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchWindowException, WebDriverException
from selenium.webdriver.common.action_chains import ActionChains
from config import create_browser,_scroll_into_view
def find_valid_detail_tab(driver, origin_handle, timeout=10):
    """Pick a valid, newly opened detail tab from the driver's handles.

    Polls ``driver.window_handles`` for up to ``timeout`` seconds and
    returns the first handle that is not ``origin_handle`` and whose URL
    is a real page (not a ``chrome://`` internal page or ``about:blank``).
    On success the driver is left switched to the returned handle.

    Args:
        driver: active selenium WebDriver.
        origin_handle: handle of the original (search-results) window.
        timeout: seconds to keep polling before giving up.

    Returns:
        The window handle of the valid detail tab.

    Raises:
        TimeoutError: if no valid detail window appears within ``timeout``
            seconds.  (Subclass of Exception, so callers with broad
            ``except Exception`` handlers are unaffected.)
    """
    # time.monotonic() is immune to system clock adjustments, unlike
    # the original time.time()-based deadline.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        for handle in driver.window_handles:
            if handle == origin_handle:
                continue
            try:
                driver.switch_to.window(handle)
                current_url = driver.current_url
            except Exception:
                # The handle may have closed between listing and switching.
                continue
            if not current_url.startswith("chrome://") and current_url != "about:blank":
                print(f"[维普切换窗口] 成功 → {driver.title}")
                return handle
        time.sleep(0.5)
    # Raise a timeout-specific builtin instead of a bare Exception.
    raise TimeoutError("未能在规定时间内找到有效详情页窗口")
# --------- per-row extraction helper ---------
def extract_row_info(row, driver):
    """Scrape one result-table row and return its metadata as a dict.

    Only rows whose type column reads "期刊论文" (journal article) are
    processed; every other row yields None.  The row's title link is
    clicked to open the detail page in a new tab, where the URL, abstract
    and keywords are collected; the tab is then closed and focus returns
    to the original window in a ``finally`` block.

    Args:
        row: selenium WebElement for one ``<tr>`` of the results table.
        driver: the active WebDriver.

    Returns:
        dict with keys title/author/source/site/keywords/originalLink/
        summary, or None if the row is skipped or any step fails.
    """
    try:
        type_text = row.find_element(By.XPATH, 'td[5]/div/span').text.strip()
    except Exception:
        type_text = ""
    # Skip anything that is not a journal article.
    if type_text != "期刊论文":
        return None
    try:
        title_element = row.find_element(By.XPATH, 'td[2]/div/div/a')
        title = title_element.text.strip()
        print("论文名称:", title)
    except Exception:
        print("[错误] 标题元素未找到")
        return None
    # Authors: best effort — an empty list is acceptable.
    try:
        author_elems = row.find_elements(
            By.XPATH,
            ".//div[contains(@class,'six-wrap')]//*[@data-warden-event-id='author-click']"
        )
        authors = [e.text.strip() for e in author_elems if e.text.strip()]
    except Exception:
        authors = []
    print("作者列表:", authors)
    # Journal source: best effort as well.
    try:
        source = row.find_element(By.XPATH, "td[4]/div/a").text
    except Exception:
        source = ""
    print("期刊来源:", source)
    print("类型:", type_text)
    time.sleep(1)
    # Remember the current window so we can return to it after the
    # detail tab is processed.
    try:
        origin = driver.current_window_handle
    except NoSuchWindowException:
        print("[错误] 当前窗口不存在")
        return None
    existing_handles = driver.window_handles
    # Click the title to open the detail page in a new window.
    # Three escalating strategies: plain click → ActionChains → JS click.
    clicked = False
    try:
        _scroll_into_view(driver, title_element)
        title_element.click()
        clicked = True
    except Exception:
        try:
            ActionChains(driver).move_to_element(title_element).pause(1).click(title_element).perform()
            clicked = True
        except Exception:
            try:
                driver.execute_script("arguments[0].click();", title_element)
                clicked = True
            except Exception:
                print("[错误] 点击标题失败")
                clicked = False
    if not clicked:
        return None
    try:
        WebDriverWait(driver, 10).until(EC.new_window_is_opened(existing_handles))
    except TimeoutException:
        print("[警告] 未检测到新窗口,跳过")
        return None
    try:
        # Grab the handle of the newly opened window.
        new_handles = driver.window_handles
        detail_tab = next((h for h in new_handles if h != origin), None)
        if not detail_tab:
            print("[警告] 找不到新窗口")
            return None
        driver.switch_to.window(detail_tab)
        time.sleep(1)
        try:
            originalLink = driver.current_url
            print("详情页链接:", originalLink)
        except NoSuchWindowException:
            print("[错误] 新窗口已关闭")
            return None
        # Abstract text (first matching element, if any).
        summary_text = ""
        try:
            abstract_elems = driver.find_elements(By.CSS_SELECTOR, "span.ellipsis.content-text")
            if abstract_elems:
                summary_text = abstract_elems[0].text.strip()
            else:
                print("[警告] 摘要信息未找到")
        except Exception:
            summary_text = ""
        print("摘要:", summary_text)
        # Keyword list from the detail page.
        keywords = []
        try:
            keyword_container = driver.find_element(By.XPATH, "//div[contains(., '关键词')]")
            keyword_spans = keyword_container.find_elements(By.CSS_SELECTOR, "span.select_hover.pointer span")
            keywords = [k.text.strip() for k in keyword_spans if k.text.strip()]
        except Exception:
            keywords = []
        print("关键词列表:", keywords)
        time.sleep(1)
    except (NoSuchWindowException, WebDriverException):
        print("[警告] 窗口操作失败")
        return None
    finally:
        # Always close the detail tab and switch back to the origin
        # window, even when an early return fired above.
        try:
            if driver.current_window_handle != origin:
                driver.close()
                driver.switch_to.window(origin)
                time.sleep(random.uniform(0.5, 1.0))
        except (NoSuchWindowException, WebDriverException):
            print("[警告] 无法切回原窗口")
    # NOTE(review): on every path that reaches this return, originalLink
    # has already been assigned, so the locals() guard looks redundant —
    # kept as a defensive fallback.
    return {
        "title": title,
        "author": authors,
        "source": source,
        "site": "维普",
        "keywords": keywords,
        "originalLink": originalLink if 'originalLink' in locals() else "",
        "summary": summary_text
    }
def crawl_current_sort(driver, limit):
    """Collect up to ``limit`` journal-article records under the current sort.

    Walks the result pages: extracts each visible row via
    ``extract_row_info`` and clicks the next-page arrow until ``limit``
    records are gathered or pagination fails.

    Args:
        driver: active WebDriver already showing a search-result page.
        limit: maximum number of records to return.

    Returns:
        list of record dicts (possibly shorter than ``limit`` if the
        result set is exhausted first).
    """
    fetched_count = 0
    results = []
    while fetched_count < limit:
        # Wait for the result-list container; on timeout, still try to
        # read whatever rows are present.
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#search_container > div.s-list > div.yx-start.content.al-str'))
            )
        except TimeoutException:
            print("[警告] 本页结果表格未出现,尝试继续")
        time.sleep(2)
        rows = driver.find_elements(By.XPATH, '//*[@id="search_container"]/div[2]/div[2]/div/div/div[3]/table/tbody/tr')
        for row in rows:
            if fetched_count >= limit:
                break
            try:
                info = extract_row_info(row, driver)
                # Only successfully extracted rows count toward the limit.
                if info:
                    results.append(info)
                    time.sleep(2)
                    fetched_count += 1
            except Exception as e:
                print(f"[错误] {e}")
                traceback.print_exc()
                # Best effort: jump back to the first window so the next
                # rows can still be processed.
                try:
                    if driver.window_handles:
                        driver.switch_to.window(driver.window_handles[0])
                except Exception:
                    pass
        # Pagination: click the right-arrow icon to load the next page.
        try:
            next_btn = driver.find_element(By.CSS_SELECTOR, "i.el-icon-arrow-right.pointer")
            # NOTE(review): is_enabled() on an <i> element is typically
            # always True — confirm this actually detects the last page.
            if not next_btn.is_enabled() or fetched_count >= limit:
                break
            _scroll_into_view(driver, next_btn)
            # Same click-fallback ladder as in extract_row_info.
            try:
                next_btn.click()
            except Exception:
                try:
                    ActionChains(driver).move_to_element(next_btn).pause(0.1).click(next_btn).perform()
                except Exception:
                    driver.execute_script("arguments[0].click();", next_btn)
            time.sleep(5)
        except Exception:
            print("[INFO] 已到最后一页或翻页失败")
            break
    return results
def weipu(keyword, limit, sort_options=None):
    """Crawl VIP (cqvip.com) search results under one or more sort orders.

    Opens the site, submits ``keyword``, switches the result page to
    list layout, then crawls ``limit`` records for each requested sort.
    The browser is always quit in the ``finally`` block.

    Args:
        keyword: search term typed into the site's search box.
        limit: number of records to collect per sort order.
        sort_options: list drawn from "relevance", "download_count",
            "publication_time"; defaults to ["relevance"].

    Returns:
        dict mapping each sort name to its list of record dicts.
    """
    driver = create_browser()
    all_results = {}
    if not sort_options:
        sort_options = ["relevance"]  # default: relevance ranking
    try:
        driver.get("https://www.cqvip.com/")
        search_input = driver.find_element(By.XPATH, "//input[@placeholder='请输入检索词']")
        search_input.send_keys(keyword)
        time.sleep(2)
        search_button = driver.find_element(By.XPATH, "//button[.//span[contains(text(),'检索')]]")
        search_button.click()
        time.sleep(3)
        # Switch the result page to list-style display.
        element = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, 'i[data-warden-event-id="list-arrange"]')
            )
        )
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", element)
        time.sleep(2)
        element.click()
        time.sleep(2)
        for sort_name in sort_options:
            if sort_name == "relevance":
                # Relevance is the site default: nothing to click.
                print("[INFO] 使用相关性排序(默认)")
            elif sort_name == "download_count":
                print("[INFO] 使用被引量排序")
                try:
                    driver.find_element(By.XPATH, '//span[contains(text(),"被引量")]').click()
                except Exception:
                    print(f"[WARN] 点击排序 {sort_name} 失败")
            elif sort_name == "publication_time":
                print("[INFO] 使用时间排序")
                try:
                    driver.find_element(By.XPATH, '//span[contains(text(),"时效性")]').click()
                except Exception:
                    print(f"[WARN] 点击排序 {sort_name} 失败")
            time.sleep(1)
            results = crawl_current_sort(driver, limit)
            all_results[sort_name] = results
    finally:
        # Always release the browser, even if the crawl blew up.
        try:
            driver.quit()
        except Exception:
            pass
    # Fixed copy-pasted message: this crawler fetches VIP records, not PDFs.
    print("[DONE] 维普抓取完成")
    print(json.dumps(all_results, ensure_ascii=False, indent=2))
    return all_results
if __name__ == '__main__':
    # Ad-hoc manual run: fetch 10 relevance-sorted results for a sample query.
    demo_keyword = "深度学习"
    demo_limit = 10
    weipu(demo_keyword, demo_limit, ["relevance"])