commit 05425ba9bf
parent f96a95fd18
Author: ganweichong
Date:   2025-10-12 13:43:20 +08:00

16 changed files with 485 additions and 349 deletions

View File

@@ -14,7 +14,10 @@ services:
     container_name: selenium-crawl-container
     ports:
       - "5001:5000"
+    environment:
+      API_MODEL: "glm-4.5"
+      API_BASE_URL: "https://open.bigmodel.cn/api/paas/v4"
+      API_KEY: "ce39bdd4fcf34ec0aec75072bc9ff988.hAp7HZTVUwy7vImn"
   # ---------- Django + Celery ----------
   selenium_django:
     build: ./selenium_django
@@ -24,7 +27,11 @@ services:
       CELERY_BROKER_URL: redis://redis:6379/0
       CELERY_RESULT_BACKEND: redis://redis:6379/0
       # Address Django uses to call the crawler service
-      CRAWL_API_URL: http://47.83.141.164:5001/crawl
+      CRAWL_API_URL: http://47.83.141.164:5001
+      # API configuration
+      API_MODEL: "glm-4-long"
+      API_BASE_URL: "https://open.bigmodel.cn/api/paas/v4"
+      API_KEY: "ce39bdd4fcf34ec0aec75072bc9ff988.hAp7HZTVUwy7vImn"
     volumes:
       - "./selenium_django:/app"
     depends_on:
@@ -39,8 +46,8 @@ services:
       context: ./selenium_vue  # one level up
       dockerfile: Dockerfile
       args:
-        VITE_API_BASE_URL: http://47.83.141.164:8002
-        VITE_CRAWL_URL: http://47.83.141.164:5001/crawl
+        VITE_API_BASE_URL: http://47.83.141.164:8002  # changed to the remote IP
+        VITE_CRAWL_URL: http://47.83.141.164:5001     # changed to the remote IP
     container_name: selenium-vue-container
     environment:
       PORT: 80

View File

@@ -4,9 +4,9 @@ from selenium import webdriver
 from selenium.webdriver.chrome.service import Service
 import os

 api_info = {
-    "model": "glm-4.5",
-    "base_url": "https://open.bigmodel.cn/api/paas/v4",
-    "api_key": "ce39bdd4fcf34ec0aec75072bc9ff988.hAp7HZTVUwy7vImn"
+    "model": os.environ.get("API_MODEL", "glm-4.5"),  # default value is optional
+    "base_url": os.environ.get("API_BASE_URL", "https://open.bigmodel.cn/api/paas/v4"),
+    "api_key": os.environ.get("API_KEY", ""),
 }

 # Chrome browser and driver configuration

View File

@@ -39,26 +39,31 @@ def translate_text(text):
         return {"chinese": [], "english": []}

     # Build the prompt
-    prompt = (
-        "你是科研助手,输入是一句话或中文关键词列表。"
-        "请从输入中理解语义,提取与科研论文主题最相关、最核心的中文主题,并翻译为英文。"
-        "只保留1~2个最核心主题,不要加入无关内容。"
-        "输出必须严格遵守 JSON 格式,不允许有额外文字或符号:{\"chinese\": [...], \"english\": [...]}。\n"
-        "示例输入输出:\n"
-        "输入: '我想获取基于深度学习的图像识别方面的研究'\n"
-        "输出: {\"chinese\": [\"基于深度学习的图像识别\"], \"english\": [\"Deep Learning-based Image Recognition\"]}\n"
-        "输入: '图像识别在深度学习方面的研究'\n"
-        "输出: {\"chinese\": [\"基于深度学习的图像识别\"], \"english\": [\"Deep Learning-based Image Recognition\"]}\n"
-        "输入: '自然语言处理模型在文本分类中的应用'\n"
-        "输出: {\"chinese\": [\"自然语言处理文本分类\"], \"english\": [\"NLP Text Classification\"]}\n"
-        "输入: '强化学习在自动驾驶决策中的最新进展'\n"
-        "输出: {\"chinese\": [\"强化学习自动驾驶决策\"], \"english\": [\"Reinforcement Learning for Autonomous Driving Decision-Making\"]}\n"
-        "输入: '使用图神经网络进行社交网络分析的研究'\n"
-        "输出: {\"chinese\": [\"图神经网络社交网络分析\"], \"english\": [\"Graph Neural Networks for Social Network Analysis\"]}\n"
-        "输入: '我想研究深度强化学习在机器人控制中的应用'\n"
-        "输出: {\"chinese\": [\"深度强化学习机器人控制\"], \"english\": [\"Deep Reinforcement Learning for Robot Control\"]}\n"
-        f"现在请对输入提取核心主题:\n输入: {text}"
-    )
+    system_prompt = """你是一名科研检索关键词提炼专家,任务是将用户输入的自然语言直接提炼为学术检索关键词。
+要求:
+1. 提炼输入中的核心研究对象、问题、方法或应用场景。
+2. 用学术化中文表达,避免口语化或宽泛词汇。
+3. 给出对应英文表达,使用国际学术界常用专业术语。
+4. 如果输入包含多个研究问题,请分别提炼关键词,每个字段最多 3 个关键词。
+5. 删除无关修饰词或无检索价值的词。
+6. 输出严格 JSON 格式,仅包含 `chinese` 和 `english` 字段,值为列表。
+
+示例:
+输入: '我想研究深度强化学习在机器人控制中的应用'
+输出: {"chinese": ["深度强化学习", "机器人控制"], "english": ["Deep Reinforcement Learning", "Robot Control"]}
+
+输入: '大模型多轮对话迷失的问题及解决方案'
+输出: {"chinese": ["大型语言模型", "多轮对话上下文漂移"], "english": ["Large Language Models", "Context Drift in Multi-turn Dialogue"]}
+
+输入: '人工智能幻觉问题及多轮对话迷失的解决方法,包括意图识别工作'
+输出: {"chinese": ["人工智能幻觉", "多轮对话上下文漂移", "意图识别"], "english": ["AI Hallucination", "Context Drift in Multi-turn Dialogue", "Intent Recognition"]}
+
+输入: '了解生态系统的能量流动和物种多样性'
+输出: {"chinese": ["生态系统能量流动", "物种多样性"], "english": ["Ecosystem Energy Flow", "Species Diversity"]}
+"""
+    user_prompt = f"""输入:{text}
+请严格输出符合 JSON 格式的核心科研关键词。
+"""

     url = f"{api_info['base_url']}/chat/completions"
@@ -68,12 +73,21 @@ def translate_text(text):
     }
     payload = {
         "model": api_info["model"],
-        "messages": [{"role": "user", "content": prompt}],
+        "messages": [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt}
+        ],
+        "thinking": {
+            "type": "disabled"
+        },
         "max_output_tokens": 512
     }
     try:
-        resp = requests.post(url, headers=headers, json=payload, timeout=30)
+        resp = requests.post(url, headers=headers, json=payload, timeout=60)
         resp.raise_for_status()
         result = resp.json()
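
Note: the new system prompt demands strict JSON, but chat models occasionally wrap the object in code fences or extra prose. A small defensive parser could be applied to the reply text before use; this is only a sketch, and it assumes the reply string has already been pulled out of the usual OpenAI-style choices[0].message.content field of result:

    import json
    import re

    def parse_keyword_reply(reply_text):
        """Best-effort extraction of {"chinese": [...], "english": [...]} from a model reply."""
        try:
            data = json.loads(reply_text)  # happy path: the reply is pure JSON
        except ValueError:
            # Fall back to the first {...} block, which also handles ```json fences.
            match = re.search(r"\{.*\}", reply_text, re.S)
            if not match:
                return {"chinese": [], "english": []}
            try:
                data = json.loads(match.group(0))
            except ValueError:
                return {"chinese": [], "english": []}
        if not isinstance(data, dict):
            return {"chinese": [], "english": []}
        return {
            "chinese": list(data.get("chinese") or []),
            "english": list(data.get("english") or []),
        }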

View File

@@ -25,6 +25,9 @@ async def call_model_api(prompt):
     payload = {
         "model": api_info["model"],
         "messages": [{"role": "user", "content": prompt}],
+        "thinking": {
+            "type": "disabled"
+        },
         "max_output_tokens": 1024
     }

View File

@@ -20,11 +20,6 @@ ENV CELERY_BROKER_URL=${CELERY_BROKER_URL:-redis://redis:6379/0}
 ENV CELERY_RESULT_BACKEND=${CELERY_RESULT_BACKEND:-redis://redis:6379/0}
 ENV CRAWL_API_URL=${CRAWL_API_URL:-http://47.83.141.164:5001/crawl}

-# Substitute the configuration in settings.py at build time
-RUN sed -i "s#CELERY_BROKER_URL = .*#CELERY_BROKER_URL = '${CELERY_BROKER_URL}'#" selenium_django/settings.py && \
-    sed -i "s#CELERY_RESULT_BACKEND = .*#CELERY_RESULT_BACKEND = '${CELERY_RESULT_BACKEND}'#" selenium_django/settings.py && \
-    sed -i "s#CRAWL_API_URL = .*#CRAWL_API_URL = '${CRAWL_API_URL}'#" selenium_django/settings.py

 # Entrypoint script
 COPY entrypoint.sh /entrypoint.sh
 RUN chmod +x /entrypoint.sh

View File

@@ -3,14 +3,16 @@ from django.db import models
 # Create your models here.
 from django.db import models

 class Task(models.Model):
     TASK_STATUS_CHOICES = [
         ('running', '进行中'),
+        ('queued', '进行中'),
         ('idle', '空闲中'),
         ('done', '完成'),
         ('failed', '失败'),
     ]

     EXECUTION_TYPE_CHOICES = [
         ('scheduled', '定期执行'),
         ('predefined', '预定时间执行'),
@@ -53,3 +55,14 @@ class TaskDetail(models.Model):
     parsed_summary = models.JSONField(blank=True, null=True)  # stores JSON
     title = models.CharField(max_length=300, blank=True)
     created_at = models.DateTimeField(auto_now_add=True)

+class CrawlQueue(models.Model):
+    task = models.ForeignKey(Task, on_delete=models.CASCADE, related_name="queue")
+    texts = models.TextField()
+    parse_flag = models.BooleanField(default=True)
+    limit = models.IntegerField(default=10)
+    sort_options = models.JSONField(default=list)
+    status = models.CharField(max_length=20, default="pending")  # pending / processing / done / failed
+    created_at = models.DateTimeField(auto_now_add=True)
+    updated_at = models.DateTimeField(auto_now=True)
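
Note: process_crawl_queue polls the oldest pending row, so the queue's status column sits on the hot path. A hypothetical variant of the new model (not part of this commit) that constrains and indexes it might look like this; it would need its own migration:

    from django.db import models

    class CrawlQueue(models.Model):
        STATUS_CHOICES = [
            ("pending", "pending"),
            ("processing", "processing"),
            ("done", "done"),
            ("failed", "failed"),
        ]
        task = models.ForeignKey("Task", on_delete=models.CASCADE, related_name="queue")
        texts = models.TextField()
        parse_flag = models.BooleanField(default=True)
        limit = models.IntegerField(default=10)
        sort_options = models.JSONField(default=list)
        # choices guard against typos; db_index speeds up the FIFO poll on status + created_at
        status = models.CharField(max_length=20, choices=STATUS_CHOICES, default="pending", db_index=True)
        created_at = models.DateTimeField(auto_now_add=True)
        updated_at = models.DateTimeField(auto_now=True)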

View File

@@ -2,7 +2,7 @@ from apscheduler.schedulers.background import BackgroundScheduler
 from apscheduler.triggers.date import DateTrigger
 from django.utils import timezone
 from datetime import datetime, date
-from .models import Task
+from .models import Task, CrawlQueue
 from .tasks import trigger_task_execution
 import logging

 logger = logging.getLogger(__name__)
@@ -25,9 +25,9 @@ def start_scheduler():
     scheduler.add_job(sync_scheduled_tasks, 'interval', seconds=30)

 def check_predefined_tasks():
-    """Check one-off tasks and trigger asynchronous Celery execution"""
+    """Check one-off tasks and add them to the queue"""
     logger.info("检查一次性任务: 开始")
-    now = datetime.now()  # use local time
+    now = datetime.now()
     tasks = Task.objects.filter(status='idle', execution_type='predefined')
     logger.debug(f"[Predefined] 检查 {len(tasks)} 个一次性任务, 当前时间 {now}")
@@ -37,24 +37,34 @@ def check_predefined_tasks():
             logger.warning(f"Task {task.id} 没有设置 execution_time,跳过")
             continue

-        # The database already stores local time; no need to make it timezone aware
         if exec_time <= now:
             try:
-                # Trigger the task asynchronously via Celery, passing only task.id
-                trigger_task_execution.delay(task.id)
-                logger.info(f"Task {task.id} 已触发 Celery 异步执行")
-                # Set the status to done to avoid re-triggering
-                task.status = 'done'
+                # Queueing logic
+                task.status = 'running'  # shown as "in progress" in the frontend
                 task.save(update_fields=['status'])
+                CrawlQueue.objects.create(
+                    task=task,
+                    texts=task.description,
+                    parse_flag=task.parse_flag,
+                    limit=task.limit,
+                    sort_options=[],
+                    status="pending"
+                )
+                process_crawl_queue.delay()
+                logger.info(f"Task {task.id} 已加入队列")
             except Exception as e:
                 logger.error(f"触发 Task {task.id} 时出错: {e}")

 def sync_scheduled_tasks():
     """Sync daily scheduled tasks into APScheduler"""
     today = date.today()
-    now = datetime.now()  # local time
+    now = datetime.now()
     tasks = Task.objects.filter(status='idle', execution_type='scheduled')
     logger.debug(f"[Scheduled] 检查 {len(tasks)} 个每日任务, 当前时间 {now}")
@@ -71,10 +81,9 @@ def sync_scheduled_tasks():
         last_run = task.last_run_date
         if last_run != today:
-            # Use local time directly; no more make_aware
             exec_datetime = datetime.combine(today, scheduled_time_obj)
             job_id = f"scheduled_task_{task.id}"

             if not scheduler.get_job(job_id):
                 scheduler.add_job(
                     run_scheduled_task,
@@ -86,7 +95,6 @@ def sync_scheduled_tasks():
                 )

 def run_scheduled_task(task_id):
     """Run a daily scheduled task"""
     try:
@@ -96,9 +104,23 @@ def run_scheduled_task(task_id):
             return

         try:
-            trigger_task_execution.delay(task.id)
-            logger.info(f"[Scheduled] Task {task.id} 已触发 Celery 执行")
+            # Queueing logic
+            task.status = 'running'
+            task.save(update_fields=['status'])
+            CrawlQueue.objects.create(
+                task=task,
+                texts=task.description,
+                parse_flag=task.parse_flag,
+                limit=task.limit,
+                sort_options=[],
+                status="pending"
+            )
+            process_crawl_queue.delay()
+            logger.info(f"[Scheduled] Task {task.id} 已加入队列")
             task.last_run_date = date.today()
             task.save(update_fields=['last_run_date'])
         except Exception as e:
             logger.error(f"[Scheduled] 执行 Task {task.id} 出错: {e}")
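
Note: the "mark running → create CrawlQueue row → process_crawl_queue.delay()" sequence now appears in three places (both scheduler paths above and the run action in views.py). A shared helper, sketched here as a suggestion and assumed to live in tasks.py next to the Celery tasks, would keep them consistent. Note also that the hunk above still imports only trigger_task_execution from .tasks, so process_crawl_queue needs to be added to that import for the new scheduler code to run:

    # Hypothetical helper, e.g. in tasks.py alongside the Celery tasks
    from .models import CrawlQueue

    def enqueue_task(task, sort_options=None):
        """Mark a Task as running and append it to the crawl queue, then kick the sequential worker."""
        task.status = 'running'
        task.save(update_fields=['status'])
        CrawlQueue.objects.create(
            task=task,
            texts=task.description,
            parse_flag=task.parse_flag,
            limit=task.limit,
            sort_options=sort_options or [],
            status="pending",
        )
        process_crawl_queue.delay()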

View File

@@ -2,9 +2,9 @@
 import requests
 from datetime import datetime, date
-from django.db import transaction
-from .models import Task, TaskDetail
+from django.db import transaction, DatabaseError
+from .models import Task, TaskDetail, CrawlQueue
 from django.utils import timezone
 import threading
 import time
@@ -18,123 +18,135 @@ def safe_dict_get(d, key, default=None):
         return d.get(key, default)
     return default

-@shared_task(bind=True, max_retries=3, default_retry_delay=60)
-def trigger_task_execution(self, task_id):
-    """Execute a single task asynchronously"""
-    task = None
-    try:
-        # Fetch the task
-        task = Task.objects.get(id=task_id)
-        task.status = 'running'
-        task.save(update_fields=['status'])
-        print(f"任务 {task_id} 状态更新为 running")
-
-        # Crawler request
-        payload = {
-            "texts": task.description,
-            "parse": task.parse_flag,
-            "limit": task.limit
-        }
-        try:
-            resp = requests.post(CRAWL_API_URL, json=payload, timeout=30000)
-            resp.raise_for_status()
-        except requests.RequestException as e:
-            print(f"Task {task_id} 爬虫请求失败: {e}")
-            raise self.retry(exc=e)
-
-        # Parse the JSON defensively
-        try:
-            data = resp.json()
-            if not isinstance(data, dict):
-                print(f"Task {task_id} 返回数据不是字典,用空 dict 代替")
-                data = {}
-        except ValueError:
-            print(f"Task {task_id} 返回非 JSON 数据: {resp.text[:200]}")
-            data = {}
-
-        # code==20000 means keyword extraction failed
-        if safe_dict_get(data, "code") == 20000:
-            print(f"Task {task_id} 爬虫返回 code=20000, message={data.get('message')}")
-            return {"success": False, "message": data.get("message", "提取不到关键词")}
-
-        # Save task details
-        results = safe_dict_get(data, "results", [])
-        if not isinstance(results, list):
-            results = []
-        with transaction.atomic():
-            for idx, item in enumerate(results, start=1):
-                if not isinstance(item, dict):
-                    print(f"Task {task_id} results 第 {idx} 个元素不是字典,跳过")
-                    continue
-                download_val = item.get("download") or 0
-                try:
-                    download_val = int(download_val)
-                except (ValueError, TypeError):
-                    download_val = 0
-                date_val = str(item.get("date")) if item.get("date") else None
-                author_val = item.get("author")
-                if isinstance(author_val, list):
-                    author_val = ';'.join(author_val)
-                elif author_val is None:
-                    author_val = ''
-                keywords_val = item.get("keywords")
-                if isinstance(keywords_val, list):
-                    keywords_val = ';'.join(keywords_val)
-                else:
-                    keywords_val = ''
-                pdf_url = item.get("pdfUrl") or ''
-                parsed_summary = item.get("parsed_summary") or {}
-                quote_val = item.get("quote") or ''
-                site_val = item.get("site") or ''
-                source_val = item.get("source") or ''
-                summary_val = item.get("summary") or ''
-                title_val = item.get("title") or ''
-                original_link = item.get("originalLink") or ''
-                # Save the TaskDetail; a single failure must not affect the others
-                try:
-                    TaskDetail.objects.get_or_create(
-                        task=task,
-                        original_link=original_link,
-                        defaults={
-                            'author': author_val,
-                            'date': date_val,
-                            'download': download_val,
-                            'keywords': keywords_val,
-                            'pdf_url': pdf_url,
-                            'parsed_summary': parsed_summary,
-                            'quote': quote_val,
-                            'site': site_val,
-                            'source': source_val,
-                            'summary': summary_val,
-                            'title': title_val
-                        }
-                    )
-                    print(f"Task {task_id} 保存第 {idx} 条结果成功")
-                except Exception as e:
-                    print(f"Task {task_id} 保存第 {idx} 条结果失败: {e}")
-                    continue
-
-        # Mark the task as done
-        task.status = 'done'
-        task.save(update_fields=['status'])
-        print(f"任务 {task_id} 执行完成")
-    except Task.DoesNotExist:
-        print(f"Task {task_id} 不存在")
-    except Exception as e:
-        print(f"Task {task_id} 执行失败: {e}")
-        try:
-            if task:
-                task.status = 'failed'
-                task.save(update_fields=['status'])
-        except Exception as e2:
-            print(f"更新任务失败状态失败: {e2}")
-        raise self.retry(exc=e)
+@shared_task(bind=True, queue='crawler', max_retries=3, default_retry_delay=60)
+def trigger_task_execution(self, task_id):
+    """Receive a task → enqueue it and wait for processing"""
+    try:
+        task = Task.objects.get(id=task_id)
+        # Mark as queued
+        task.status = 'queued'
+        task.save(update_fields=['status'])
+        print(f"任务 {task_id} 状态更新为 queued")
+
+        # Store the task in CrawlQueue
+        CrawlQueue.objects.create(
+            task=task,
+            texts=task.description,
+            parse_flag=task.parse_flag,
+            limit=task.limit,
+            sort_options=[],
+            status="pending",
+        )
+        print(f"任务 {task_id} 已加入爬虫队列")
+
+        # Kick off queue processing immediately
+        process_crawl_queue.delay()
+    except Task.DoesNotExist:
+        print(f"Task {task_id} 不存在")
+    except Exception as e:
+        print(f"Task {task_id} 入队失败: {e}")
+        raise self.retry(exc=e)
+
+@shared_task(bind=True, queue='crawl_worker', max_retries=3, default_retry_delay=60)
+def process_crawl_queue(self):
+    """
+    Process queued crawl jobs sequentially, one after another.
+    """
+    item = None
+    try:
+        # Fetch the oldest pending item (row lock to avoid concurrent pickup)
+        with transaction.atomic():
+            item = (
+                CrawlQueue.objects
+                .select_for_update(skip_locked=True)
+                .filter(status='pending')
+                .order_by('created_at')
+                .first()
+            )
+            if not item:
+                return "no task"
+            # Mark the queue item and task as in progress
+            item.status = 'processing'
+            item.save(update_fields=['status'])
+            task = item.task
+            task.status = 'running'
+            task.save(update_fields=['status'])
+
+        # Run the network request outside the transaction to keep lock time short
+        payload = {
+            "texts": item.texts,
+            "parse": item.parse_flag,
+            "limit": item.limit,
+            "sort": item.sort_options
+        }
+        print(f"开始请求爬虫 task_id={task.id}")
+        resp = requests.post(CRAWL_API_URL, json=payload, timeout=300)
+        resp.raise_for_status()
+        try:
+            data = resp.json()
+        except ValueError:
+            print(f"Task {task.id} 返回非 JSON 数据: {resp.text[:200]}")
+            data = {}
+        results = data.get("results", [])
+        if not isinstance(results, list):
+            results = []
+
+        # Save the results under transaction protection
+        with transaction.atomic():
+            for idx, r in enumerate(results, start=1):
+                TaskDetail.objects.get_or_create(
+                    task=task,
+                    original_link=r.get("originalLink") or "",
+                    defaults={
+                        "author": ";".join(r.get("author", [])) if isinstance(r.get("author"), list) else (r.get("author") or ""),
+                        "date": str(r.get("date")) if r.get("date") else None,
+                        "download": int(r.get("download") or 0),
+                        "keywords": ";".join(r.get("keywords", [])) if isinstance(r.get("keywords"), list) else (r.get("keywords") or ""),
+                        "pdf_url": r.get("pdfUrl") or "",
+                        "parsed_summary": r.get("parsed_summary") or {},
+                        "quote": r.get("quote") or "",
+                        "site": r.get("site") or "",
+                        "source": r.get("source") or "",
+                        "summary": r.get("summary") or "",
+                        "title": r.get("title") or "",
+                    }
+                )
+                print(f"Task {task.id} 保存第 {idx} 条结果成功")
+
+        # Mark as done
+        with transaction.atomic():
+            task.status = 'done'
+            task.save(update_fields=['status'])
+            item.status = 'done'
+            item.save(update_fields=['status'])
+        print(f"任务 {task.id} 执行完成")
+
+    except requests.RequestException as e:
+        print(f"网络请求失败 task_id={item.task.id if item else 'N/A'}: {e}")
+        if item:
+            with transaction.atomic():
+                item.status = 'pending'
+                item.save(update_fields=['status'])
+        raise self.retry(exc=e)
+    except DatabaseError as e:
+        print(f"数据库异常 task_id={item.task.id if item else 'N/A'}: {e}")
+        raise self.retry(exc=e)
+    except Exception as e:
+        print(f"任务执行失败 task_id={item.task.id if item else 'N/A'}: {e}")
+        if item:
+            with transaction.atomic():
+                task.status = 'failed'
+                task.save(update_fields=['status'])
+                item.status = 'failed'
+                item.save(update_fields=['status'])
+        raise self.retry(exc=e)
+    finally:
+        # Trigger the next queued job
+        process_crawl_queue.apply_async(countdown=1)
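
Note: because the finally block runs even on the early `return "no task"`, an empty queue re-schedules process_crawl_queue every second indefinitely. A guarded variant (a sketch, not part of this commit; handle_queue_item is a hypothetical helper standing in for the request/save/mark-done body above) chains the next run only when an item was actually claimed:

    from celery import shared_task
    from django.db import transaction
    from .models import CrawlQueue  # assumes the same app-relative import as above

    @shared_task(bind=True, queue='crawl_worker', max_retries=3, default_retry_delay=60)
    def process_crawl_queue(self):
        with transaction.atomic():
            item = (
                CrawlQueue.objects
                .select_for_update(skip_locked=True)
                .filter(status='pending')
                .order_by('created_at')
                .first()
            )
            if item is None:
                return "no task"  # queue empty: stop here instead of polling every second
            item.status = 'processing'
            item.save(update_fields=['status'])
        try:
            handle_queue_item(item)  # hypothetical helper: crawl request, TaskDetail saves, status updates
        finally:
            # Chain the next run only when this invocation did real work.
            process_crawl_queue.apply_async(countdown=1)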

View File

@@ -10,13 +10,21 @@ from django_filters.rest_framework import DjangoFilterBackend
 # Create your views here.
 from rest_framework import viewsets, filters
 from rest_framework.pagination import PageNumberPagination
-from .models import Task, TaskDetail
+from .models import Task, TaskDetail, CrawlQueue
 from .serializers import TaskSerializer, TaskDetailSerializer, TaskListSerializer
 from rest_framework.decorators import action
 from rest_framework.response import Response
 from rest_framework import status
-from .tasks import trigger_task_execution
+from .tasks import trigger_task_execution, process_crawl_queue
 import threading
+import logging
+
+logger = logging.getLogger(__name__)
+
+print(f'----------chat----------init---------')

 # Pagination settings
 class StandardResultsSetPagination(PageNumberPagination):
     page_size = 10
@@ -36,6 +44,7 @@ def sync_stream(generator):
             # Fetch the next chunk from the async generator
             chunk = loop.run_until_complete(async_gen.__anext__())
             if chunk and chunk.strip():
+                print(chunk)
                 yield chunk
         except StopAsyncIteration:
             break
@@ -52,6 +61,9 @@ async def call_model_stream(messages):
         "model": api_info["model"],
         "messages": messages,
         "max_output_tokens": 1024,
+        "thinking": {
+            "type": "disabled"
+        },
         "stream": True
     }
@@ -77,6 +89,12 @@ class TaskViewSet(viewsets.ModelViewSet):
     ordering_fields = ['created_at', 'updated_at']

     def get_serializer_class(self):
+        print(f'----------get_serializer_class-------------------')
+        print(f'1111111111')
         if self.action == 'list':
             return TaskListSerializer  # list returns simplified fields
         return TaskSerializer  # retrieve returns the full fields, including details
@@ -86,14 +104,26 @@ class TaskViewSet(viewsets.ModelViewSet):
         task = self.get_object()
         try:
-            # Trigger the Celery task asynchronously
-            async_result = trigger_task_execution.delay(task.id)
+            # Mark the task as queued (shown as "in progress" in the frontend)
+            task.status = 'running'  # the frontend can still read this as "in progress"
+            task.save(update_fields=['status'])
+
+            # Create the queue record
+            CrawlQueue.objects.create(
+                task=task,
+                texts=task.description,
+                parse_flag=task.parse_flag,
+                limit=task.limit,
+                sort_options=[],
+                status="pending"
+            )
+
+            # Trigger queue processing (asynchronous; a single worker runs items sequentially)
+            process_crawl_queue.delay()

-            # Return immediately after triggering; do not touch async_result's content
             return Response({
                 "success": True,
-                "task_id": async_result.id,
-                "message": f"任务 {task.id} 已触发"
+                "message": f"任务 {task.id} 已加入队列"
             }, status=status.HTTP_200_OK)

         except Exception as e:
@@ -104,12 +134,23 @@ class TaskViewSet(viewsets.ModelViewSet):
     @action(detail=True, methods=['post'])
     def chat(self, request, pk=None):
+        print(f'----------chat-------------------')
+        print(f'222222222222222')
         task = self.get_object()
         user_question = request.data.get("question", "")
+        print(f'----chat--------------user_question={user_question}--------------')
         if not user_question:
             return Response({"success": False, "message": "question 参数不能为空"}, status=400)

         # Build the structured documents
+        print(f'----chat--------------task={task}--------------')
         all_docs = TaskDetail.objects.filter(task=task)
         all_docs_list = []
         for doc in all_docs:
@@ -125,6 +166,9 @@ class TaskViewSet(viewsets.ModelViewSet):
             })

         all_docs_json = json.dumps(all_docs_list, ensure_ascii=False)
+        print(f'----chat--------------all_docs_json={all_docs_json}--------------')

         SYSTEM_PROMPT = """
         你是专业文献问答助手,请严格根据提供的任务文档回答用户问题。
         任务文档内容已经结构化提供为 JSON 列表,每条文档包含字段:
@@ -144,6 +188,9 @@ class TaskViewSet(viewsets.ModelViewSet):
         # Return via Django's StreamingHttpResponse
         response = StreamingHttpResponse(sync_stream(call_model_stream(messages)), content_type="text/event-stream")
+        print(f'----chat--------------666666666--------------')
         return response

 from rest_framework import status
 from rest_framework.response import Response
@@ -156,6 +203,11 @@ class TaskDetailViewSet(viewsets.ModelViewSet):
     search_fields = ['title', 'author', 'site']

     def get_queryset(self):
+        print(f'----------get_queryset-------------------')
+        print(f'33333333333333')
         queryset = super().get_queryset()
         task_id = self.request.query_params.get('task')
         if task_id and task_id.isdigit():
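
Note: the chat action streams its reply as text/event-stream, so a plain resp.json() on the client will not work. A minimal consumer sketch follows; the /api/tasks/ prefix is an assumption, since the router mount point is not visible in this diff, but the action name 'chat' comes from the hunk above:

    import requests

    BASE = "http://47.83.141.164:8002/api"  # assumed router prefix

    def stream_chat(task_id, question):
        """Consume the chat action's streamed answer chunk by chunk."""
        url = f"{BASE}/tasks/{task_id}/chat/"
        with requests.post(url, json={"question": question}, stream=True, timeout=300) as resp:
            resp.raise_for_status()
            for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
                if chunk:
                    print(chunk, end="", flush=True)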

Binary file not shown.

View File

@@ -1,10 +1,14 @@
 #!/bin/bash
 # entrypoint.sh

-# Start the Celery worker
-echo "Starting Celery..."
-celery -A selenium_django worker -l info --pool=solo &
+# Start the Celery enqueue worker (may be multithreaded)
+echo "Starting Celery crawler queue worker..."
+celery -A selenium_django worker -Q crawler -l info --pool=threads -c 4 &

-# Start Django
+# Start the Celery crawl-processing worker (sequential, single-threaded)
+echo "Starting Celery crawl_worker (sequential)..."
+celery -A selenium_django worker -Q crawl_worker -l info --pool=prefork -c 1 &

+# Start Django via Gunicorn
 echo "Starting Django..."
 exec gunicorn selenium_django.wsgi:application --log-level=info --bind 0.0.0.0:8000
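
Note: queue assignment is currently driven entirely by the queue= argument on each @shared_task decorator, which must stay in sync with the -Q flags above. A more centralised alternative (a sketch, assuming the Celery app object lives in selenium_django/celery.py; the dotted task paths depend on the actual app label, which this diff does not show) is Celery task routing:

    # selenium_django/celery.py (assumed location of the Celery app)
    from celery import Celery

    app = Celery("selenium_django")
    app.config_from_object("django.conf:settings", namespace="CELERY")
    app.autodiscover_tasks()

    # Route the tasks onto the same queues the two workers listen on.
    app.conf.task_routes = {
        "<app_label>.tasks.trigger_task_execution": {"queue": "crawler"},
        "<app_label>.tasks.process_crawl_queue": {"queue": "crawl_worker"},
    }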

View File

@@ -11,26 +11,26 @@ https://docs.djangoproject.com/en/5.2/ref/settings/
 """

 from pathlib import Path
+import os

 # Build paths inside the project like this: BASE_DIR / 'subdir'.
 BASE_DIR = Path(__file__).resolve().parent.parent

 # Celery configuration
-CELERY_BROKER_URL = 'redis://redis:6379/0'
-CELERY_RESULT_BACKEND = 'redis://redis:6379/0'
+CELERY_BROKER_URL = os.environ.get("CELERY_BROKER_URL", "redis://redis:6379/0")
+CELERY_RESULT_BACKEND = os.environ.get("CELERY_RESULT_BACKEND", "redis://redis:6379/0")
 CELERY_ACCEPT_CONTENT = ['json']
 CELERY_TASK_SERIALIZER = 'json'
 CELERY_RESULT_SERIALIZER = 'json'
 CELERY_TIMEZONE = 'Asia/Shanghai'  # adjust to your local timezone

 # Crawler API address
-CRAWL_API_URL = "http://47.83.141.164:5001/crawl"
+CRAWL_API_URL = os.environ.get("CRAWL_API_URL", "http://selenium:5000/crawl")

 # Model API configuration
 api_info = {
-    "model": "glm-4.5",
-    "base_url": "https://open.bigmodel.cn/api/paas/v4",
-    "api_key": "ce39bdd4fcf34ec0aec75072bc9ff988.hAp7HZTVUwy7vImn"
+    "model": os.environ.get("API_MODEL", "glm-4.5"),  # default value is optional
+    "base_url": os.environ.get("API_BASE_URL", "https://open.bigmodel.cn/api/paas/v4"),
+    "api_key": os.environ.get("API_KEY", ""),
 }

 # Quick-start development settings - unsuitable for production
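
Note: with the key now read from the environment and defaulting to an empty string, a misconfigured container only fails later with an opaque auth error from the model API. An optional fail-fast guard at settings import time could look like the sketch below; since the literal key is also committed in docker-compose.yml in this same change, it should be treated as exposed and rotated into an untracked env_file:

    import os

    API_KEY = os.environ.get("API_KEY", "")
    if not API_KEY:
        # Surface the misconfiguration immediately instead of on the first model request.
        raise RuntimeError("API_KEY is not set; pass it via the compose 'environment' block or an env_file.")

    api_info = {
        "model": os.environ.get("API_MODEL", "glm-4.5"),
        "base_url": os.environ.get("API_BASE_URL", "https://open.bigmodel.cn/api/paas/v4"),
        "api_key": API_KEY,
    }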

View File

@@ -4,9 +4,12 @@ FROM node:18-alpine as builder
 # Set the working directory
 WORKDIR /app

-# Build-time environment variables
-ARG VITE_API_BASE_URL=${VITE_API_BASE_URL:-http://localhost:8000/api}
-ENV VITE_API_BASE_URL=$VITE_API_BASE_URL
+ARG VITE_API_BASE_URL
+ARG VITE_CRAWL_URL
+
+# Expose them to Vite at build time
+ENV VITE_API_BASE_URL=${VITE_API_BASE_URL}
+ENV VITE_CRAWL_URL=${VITE_CRAWL_URL}

 # Copy the frontend code
 COPY frontend-vite/package*.json ./

View File

@@ -13,5 +13,3 @@
     <script type="module" src="/src/main.js"></script>
   </body>
 </html>

View File

@@ -1,8 +1,15 @@
-// src/config/API_CONFIG.ts
 // API configuration - supports environment-variable parameterization
 export const API_CONFIG = {
-  // Read the API base URL from the environment; fall back to a default if unset
-  BASE_URL: import.meta.env.VITE_API_BASE_URL || 'http://localhost:8000/api',
-  CRAWL_URL: import.meta.env.VITE_CRAWL_API_URL || 'http://localhost:5000',
+  // Use the host-mapped ports; do not use the Docker service names
+  BASE_URL: import.meta.env.VITE_API_BASE_URL
+    ? import.meta.env.VITE_API_BASE_URL.replace(/\/$/, "") // no longer appends /api
+    : "http://47.83.141.164:8002/api", // default remote server API root path
+  CRAWL_URL: import.meta.env.VITE_CRAWL_URL
+    ? import.meta.env.VITE_CRAWL_URL.replace(/\/$/, "")
+    : "http://47.83.141.164:5001/crawl", // default remote crawler service

   // Timeout configuration
   TIMEOUT: {
@@ -12,38 +19,38 @@ export const API_CONFIG = {
   // Request header configuration
   HEADERS: {
-    'Content-Type': 'application/json',
+    "Content-Type": "application/json",
   },
-}
+};

 // Dynamic settings (can be modified at runtime)
 export const settings = {
   get baseUrl() {
-    return localStorage.getItem('api_base_url') || API_CONFIG.BASE_URL
+    return localStorage.getItem("api_base_url") || API_CONFIG.BASE_URL;
   },
   set baseUrl(v) {
-    localStorage.setItem('api_base_url', (v || '').replace(/\/$/, ''))
+    localStorage.setItem("api_base_url", (v || "").replace(/\/$/, ""));
   },
   get crawlUrl() {
-    return localStorage.getItem('crawl_url') || API_CONFIG.CRAWL_URL
+    return localStorage.getItem("crawl_url") || API_CONFIG.CRAWL_URL;
   },
   set crawlUrl(v) {
-    localStorage.setItem('crawl_url', (v || '').replace(/\/$/, ''))
+    localStorage.setItem("crawl_url", (v || "").replace(/\/$/, ""));
   },
-}
+};

-// Get current environment info
+// Get current environment info (for debugging)
 export const getEnvironmentInfo = () => ({
   NODE_ENV: import.meta.env.NODE_ENV,
   BASE_URL: import.meta.env.BASE_URL,
   API_BASE_URL: import.meta.env.VITE_API_BASE_URL,
-  CRAWL_API_URL: import.meta.env.VITE_CRAWL_API_URL,
+  CRAWL_API_URL: import.meta.env.VITE_CRAWL_URL,
   currentBaseUrl: settings.baseUrl,
   currentCrawlUrl: settings.crawlUrl,
-})
+});

 // Development-time debug info
 if (import.meta.env.DEV) {
-  console.log('🔧 API Configuration:', getEnvironmentInfo())
+  console.log("🔧 API Configuration:", getEnvironmentInfo());
 }

View File

@@ -80,7 +80,7 @@
     <div v-for="(it, idx) in filteredItems" :key="it.id || idx" class="panel" style="min-height:180px; display:flex; flex-direction:column; cursor:pointer;" @click="isDone(it.status) ? $router.push(`/tasks/${it.id}`) : null">
       <div style="display:flex; align-items:center; justify-content:space-between; gap:8px;">
         <div style="font-weight:700; overflow:hidden; text-overflow:ellipsis; white-space:nowrap; max-width:70%;">{{ it.name || '未命名任务' }}</div>
-        <span :style="statusPill(it.status)">{{ statusText(it.status) }}</span>
+        <span :style="statusPill(it.status)">{{ statusText(it.status, it.id) }}</span>
       </div>
       <div style="color:#6b7280; margin-top:4px; font-size:12px;">{{ it.task_id || '无ID' }}</div>
       <div style="color:#374151; margin-top:8px; flex:1; overflow:auto; font-size:13px;">{{ it.description || '(无描述)' }}</div>
@@ -481,9 +481,14 @@ function statusPill(s){
   const c = color[s] || '#6b7280'
   return { background:bg, color:c, padding:'4px 10px', borderRadius:'999px', fontWeight:'600', fontSize:'12px' }
 }
-function statusText(s){
+function statusText(s, id){
+  const local = localStatusMap.value[id]
+  if (local === 'running') return '运行中'
+  if (local === 'done') return '完成'
+  if (local === 'failed') return '失败'
   const map = {
     running:'运行中',
+    queued:'运行中',
     idle:'空闲中',
     done:'完成',
     failed:'失败',
@@ -601,14 +606,14 @@ async function runTasksConcurrently(tasks) {
   await fetchTasks()
 }

+const localStatusMap = ref({}) // { [taskId]: 'running' | 'done' | null }
+
 //
 async function runNow(task) {
   //
   if (runningIds.value.has(task.id)) return
   runningIds.value.add(task.id)
+  localStatusMap.value[task.id] = 'running'

-  //
-  task.status = 'running'

   // fetchTasks
   try {
@@ -650,6 +655,7 @@ async function runNow(task) {
         console.log(`任务 ${task.id} 达到终止状态: ${currentTask.status},停止轮询`)
         clearInterval(interval)
         runningIds.value.delete(task.id)
+        localStatusMap.value[task.id] = data.status
       }
     } else {
       console.error(`轮询中未找到任务 ${task.id}`)