#!/usr/bin/env python

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import re
import json
import time
import random
import requests
from bs4 import NavigableString, BeautifulSoup as bs
from markitdown import MarkItDown


inline_tags = ['a', 'img', 'b', 'strong', 'em', 'i', 'code', 'del']
# block_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'blockquote', 'pre']

block_map = {
    'normal': {
        'h1': '\n# {}\n',
        'h2': '\n## {}\n',
        'h3': '\n### {}\n',
        'h4': '\n#### {}\n',
        'h5': '\n##### {}\n',
        'h6': '\n###### {}\n',
        'hr': '\n\n---\n',
        'div': '{}',
        'article': '{}',
        'section': '{}'
    },
    'intent': {
        'p': '\n{}{}\n',
        'blockquote': '{}> {}',
        'pre': '\n{}```{}\n{}\n{}```\n'
    }
}

inline_map = {
    'normal': {
        'i': '*{}*',
        'em': '*{}*',
        'b': '**{}**',
        'strong': '**{}**',
        'del': '~~{}~~',
        'code': '`{}`'
    },
    'link': {
        'a': '[{}]({})',
        'img': ''
    }
}
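
# For reference (illustrative note only, not used at runtime): the templates above
# are filled with str.format(), e.g.
#   block_map['normal']['h2'].format('Title')                  -> '\n## Title\n'
#   inline_map['normal']['code'].format('x = 1')               -> '`x = 1`'
#   inline_map['link']['a'].format('text', 'https://example.com')
#                                                               -> '[text](https://example.com)'
# Note that inline_map['link']['img'] is an empty template, so images are dropped
# from the generated markdown.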

# Extract plain text from HTML
def get_text_from_html(html_text, is_file=False):
    html_content = html_text
    # If a file path was passed, read the HTML from that file
    if is_file:
        with open(html_text, "r", encoding="utf-8") as file_obj:
            html_content = file_obj.read()
    # Parse the HTML into a BeautifulSoup object
    soup = bs(html_content, "html.parser")
    # Extract the text content
    raw_text = soup.get_text()
    return raw_text
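
# Illustrative example (assumed markup, not taken from any real page):
#   get_text_from_html('<p>Hello <b>world</b></p>')  -> 'Hello world'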

# Fetch an article by URL and convert it to markdown
def url_to_markdown(url):
    title = ""
    md = ""
    try:
        headers = {
            "Accept": "application/json, text/plain, */*",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
            "Connection": "keep-alive",
            "Sec-Ch-Ua": '"Not A(Brand";v="99", "Microsoft Edge";v="121", "Chromium";v="121"',
            "Sec-Ch-Ua-Mobile": "?0",
            "Sec-Ch-Ua-Platform": '"Windows"',
            "Sec-Fetch-Dest": 'empty',
            "Sec-Fetch-Mode": 'cors',
            "Sec-Fetch-Site": 'same-origin',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0"
        }
        # Fetch the HTML
        res = requests.get(url, headers=headers)
        html_content = res.text
        # Extract the title
        soup = bs(html_content, 'html.parser')
        title = soup.title.string
        # Convert the HTML to markdown
        html_md = convert(html_content)
        # Run the result through the HTML filter once more
        md = get_text_from_html(html_md)
        # print(md)
        # with open('test.md', "w", encoding="utf-8") as file_obj:
        #     file_obj.write(md)
    except Exception as e:
        print(f'--url_to_markdown--> crawl failed: e={e}, url={url}')
    # Return the result
    return (title, md)
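
# Example usage (sketch only; the URL below is a placeholder, any public article
# page from the supported sites would do):
#   title, md = url_to_markdown('https://example.com/some-article')
#   print(title)
#   print(md)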

# Fetch the raw HTML of an article by URL
def url_to_raw_html(url):
    html_content = ""
    try:
        headers = {
            "Accept": "application/json, text/plain, */*",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
            "Connection": "keep-alive",
            "Sec-Ch-Ua": '"Not A(Brand";v="99", "Microsoft Edge";v="121", "Chromium";v="121"',
            "Sec-Ch-Ua-Mobile": "?0",
            "Sec-Ch-Ua-Platform": '"Windows"',
            "Sec-Fetch-Dest": 'empty',
            "Sec-Fetch-Mode": 'cors',
            "Sec-Fetch-Site": 'same-origin',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0"
        }
        # Fetch the HTML
        res = requests.get(url, headers=headers)
        html_content = res.content
    except Exception as e:
        print(f'--url_to_raw_html--> crawl failed: e={e}, url={url}')
    # Return the result
    return html_content

# Convert article HTML to markdown
def html_to_markdown(html_content):
    title = ""
    md = ""
    try:
        # Extract the title
        soup = bs(html_content, 'html.parser')
        title = soup.title.string
        # Convert the HTML to markdown
        html_md = convert(html_content)
        # Run the result through the HTML filter once more
        md = get_text_from_html(html_md)
        # print(md)
        # with open('test.md', "w", encoding="utf-8") as file_obj:
        #     file_obj.write(md)
    except Exception:
        pass
    return (title, md)
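
# Example usage (sketch only; chains the two helpers above with a placeholder URL):
#   raw = url_to_raw_html('https://example.com/some-article')
#   title, md = html_to_markdown(raw)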

def cnblogs_list(user_name):
    headers = {
        "Accept": "application/json, text/plain, */*",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        "Connection": "keep-alive",
        "Cookie": "",
        "Host": "",
        "Referer": f"",
        "Sec-Ch-Ua": '"Not A(Brand";v="99", "Microsoft Edge";v="121", "Chromium";v="121"',
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": '"Windows"',
        "Sec-Fetch-Dest": 'empty',
        "Sec-Fetch-Mode": 'cors',
        "Sec-Fetch-Site": 'same-origin',
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0"
    }
    # Target blog URL
    url = 'https://www.cnblogs.com/' + user_name + '/?page=1'
    # Fetch the HTML
    response = requests.get(url, headers=headers)
    # Parse the HTML into a BeautifulSoup object
    soup = bs(response.text, 'html.parser')
    page_num = 1
    # Use select() to grab every page-number element from the pager
    page_lists = soup.select('div.pager a')
    for a_element in page_lists:
        numbers = re.findall(r'\d+', a_element.text)
        if numbers:
            number = int(numbers[0])  # convert the extracted digit string to an int
            if page_num < number:
                page_num = number
    paper_list = []
    articles = soup.select('div.day div.postTitle a')
    for article in articles:
        href = article['href']
        # print(href)
        paper_list.append(href)
    if page_num > 1:
        random_wait_time = random.randint(1, 2)
        time.sleep(random_wait_time)
        for i in range(2, page_num + 1):
            url = 'https://www.cnblogs.com/' + user_name + '/?page=' + str(i)
            response = requests.get(url, headers=headers)
            # print(response.status_code)
            soup = bs(response.text, 'html.parser')
            articles = soup.select('div.day div.postTitle a')
            for article in articles:
                href = article['href']
                # print(href)
                paper_list.append(href)
    return paper_list

def csdn_list(user_name):
    headers = {
        "Accept": "application/json, text/plain, */*",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        "Connection": "keep-alive",
        "Cookie": "",
        "Host": "blog.csdn.net",
        "Referer": f"https://blog.csdn.net/{user_name}?type=blog",
        "Sec-Ch-Ua": '"Not A(Brand";v="99", "Microsoft Edge";v="121", "Chromium";v="121"',
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": '"Windows"',
        "Sec-Fetch-Dest": 'empty',
        "Sec-Fetch-Mode": 'cors',
        "Sec-Fetch-Site": 'same-origin',
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0"
    }
    # Endpoint that reports how many posts the user has
    url = 'https://blog.csdn.net/community/home-api/v1/get-tab-total?username=' + user_name
    # Fetch the article count
    response = requests.get(url, headers=headers)
    paper_list = []
    if response.status_code == 200:
        data = response.json()
        article_num = data['data']['blog']
        # print(article_num)
        page_num = article_num // 20  # integer division: full pages of 20
        remainder = article_num % 20  # remainder
        if remainder > 0:
            page_num += 1  # one extra page if there is a remainder
        for i in range(1, page_num + 1):
            url = 'https://blog.csdn.net/community/home-api/v1/get-business-list?page=' + str(i) + '&size=20&businessType=blog&orderby=&noMore=false&year=&month=&username=' + user_name
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                data = response.json()
                for item in data['data']['list']:
                    href = item['url']
                    # print(href)
                    paper_list.append(href)
    return paper_list

def zhihu_list(user_name):
    headers = {
        "Accept": "application/json, text/plain, */*",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        "Connection": "keep-alive",
        "Cookie": "",
        "Host": "",
        "Referer": f"",
        "Sec-Ch-Ua": '"Not A(Brand";v="99", "Microsoft Edge";v="121", "Chromium";v="121"',
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": '"Windows"',
        "Sec-Fetch-Dest": 'empty',
        "Sec-Fetch-Mode": 'cors',
        "Sec-Fetch-Site": 'same-origin',
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0"
    }
    # Target profile URL
    url = 'https://www.zhihu.com/people/' + user_name + '/posts?page=1'
    # Fetch the HTML
    response = requests.get(url, headers=headers)
    # Parse the HTML into a BeautifulSoup object
    soup = bs(response.text, 'html.parser')
    page_num = 1
    # Use select() to grab every page-number element from the pager
    page_lists = soup.select('div.Pagination button')
    for a_element in page_lists:
        numbers = re.findall(r'\d+', a_element.text)
        if numbers:
            number = int(numbers[0])  # convert the extracted digit string to an int
            if page_num < number:
                page_num = number
    paper_list = []
    articles = soup.select('h2.ContentItem-title span a')
    for article in articles:
        href = article['href']
        # print(href)
        paper_list.append(href)
    if page_num > 1:
        random_wait_time = random.randint(1, 2)
        time.sleep(random_wait_time)
        for i in range(2, page_num + 1):
            url = 'https://www.zhihu.com/people/' + user_name + '/posts?page=' + str(i)
            response = requests.get(url, headers=headers)
            # print(response.status_code)
            soup = bs(response.text, 'html.parser')
            articles = soup.select('h2.ContentItem-title span a')
            for article in articles:
                href = article['href']
                # print(href)
                paper_list.append(href)
    return paper_list

# cnblogs
# user_name = 'lingwang3'
# blog_name = 'cnblogs'
# cnblogs_list(user_name)

# csdn
# user_name = 'weixin_40340586'
# blog_name = 'csdn'
# csdn_list(user_name)

# zhihu ye-chi-4-96
# user_name = '--89-68-45'
# blog_name = 'zhihu'
# zhihu_list(user_name)

def convert(html):
    soup = bs(html, 'html.parser')
    # CSDN    - #content_views
    # cnblogs - #cnblogs_post_body
    # zhihu   - .Post-RichTextContainer
    container = soup.select_one('#content_views') \
        or soup.select_one('#cnblogs_post_body') \
        or soup.select_one('.Post-RichTextContainer') \
        or soup.select_one('article .post-container') \
        or soup.select_one('article #content') \
        or soup.select_one('.page-article .main-content .article-main .article-content') \
        or soup.select_one('article') \
        or soup.select_one('body') \
        or soup
    return __print_tree(container)
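
# Illustrative example (assumed markup; output shown approximately):
#   convert('<h1>Title</h1><p>Hello <b>world</b></p>')
#   # -> '\n# Title\n\nHello **world**\n'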

def __print_tree(ele, intent=0, md=''):
    """Recursively walk the DOM tree (recursion keeps the implementation simple for now).

    Arguments:
        ele {bs} -- element to convert

    Keyword Arguments:
        intent {int} -- indentation level (default: {0})
        md {str} -- markdown accumulated so far (default: {''})

    Returns:
        str -- the converted markdown
    """
    if isinstance(ele, NavigableString):
        md = __transform_text(ele, md)
    elif ele.name == 'img':
        md = __transform_img(ele, md)
    elif ele.name == 'a':
        md = __transform_a(ele, md, intent)
    elif ele.name in inline_map['normal'].keys():
        md = __transform_inline_tags(ele, md, intent)
    elif ele.name == 'pre':
        md = __transform_pre(ele, md, intent)
    elif ele.name in ('ul', 'ol'):
        md = __transform_list_tags(ele, md, intent)
    elif ele.name in block_map['normal'].keys():
        md = __transform_block_normal_tags(ele, md, intent)
    elif ele.name in block_map['intent'].keys():
        md = __transform_block_intent_tags(ele, md, intent)
    elif ele.name == '[document]':
        md = __transform_soup(ele, md, intent)
    else:
        md = __transform_other_tags(ele, md, intent)

    return md

def __transform_text(ele, md):
    # Collapse runs of whitespace into a single space
    text = re.compile(r'[\s]+').sub(' ', ele.string)
    # Keep surrounding spaces only when the neighbouring sibling is an inline tag
    text = text if ele.previous_sibling and ele.previous_sibling.name in inline_tags else text.lstrip()
    text = text if ele.next_sibling and ele.next_sibling.name in inline_tags else text.rstrip()
    md += text

    return md

def __transform_img(ele, md):
    # The 'img' template is empty, so images are effectively dropped from the output
    md += inline_map['link']['img'].format(ele.get('alt') or '', ele.get('src') or '')

    return md

def __transform_a(ele, md, intent):
    a_inner = ''
    for child in ele.children:
        a_inner = __print_tree(child, intent, a_inner)

    if a_inner != '':
        md += inline_map['link']['a'].format(a_inner, ele.get('href') or ele.get_text(strip=True))

    return md

def __transform_pre(ele, md, intent):
    # Remove the 'hljs' marker class; whatever classes remain are treated as the language name
    lang_tag = ele.find(class_='hljs')
    if lang_tag:
        lang_tag['class'].remove('hljs')
    lang = ''.join(lang_tag['class']) if lang_tag else ''
    md += block_map['intent']['pre'].format(' ' * intent, lang, ele.text.strip().replace('\n', '\n' + ' ' * intent), ' ' * intent)

    return md

def __transform_inline_tags(ele, md, intent):
    inline_tag_inner = ''
    for child in ele.children:
        inline_tag_inner = __print_tree(child, intent, inline_tag_inner)
    if inline_tag_inner:
        md += inline_map['normal'][ele.name].format(inline_tag_inner)

    return md

def __transform_block_normal_tags(ele, md, intent):
    block_tag_inner = ''
    for child in ele.children:
        block_tag_inner = __print_tree(child, intent, block_tag_inner)
    md += block_map['normal'][ele.name].format(block_tag_inner)

    return md

def __transform_block_intent_tags(ele, md, intent):
    block_tag_inner = ''
    tpl = block_map['intent'][ele.name]
    prev = ' ' * intent

    # Elements inside a blockquote inherit the accumulated '> ' prefix via the data-prev attribute
    if ele.parent.name == 'blockquote':
        prev = ele.parent['data-prev']
        ele['data-prev'] = ele.parent['data-prev'] + '> '
        tpl = ele.parent['data-prev'] + '\n' + tpl + '\n'
    elif ele.name == 'blockquote':
        tpl = '\n' + tpl + '\n'
        ele['data-prev'] = ' ' * intent + '> '

    for child in ele.children:
        block_tag_inner = __print_tree(child, intent, block_tag_inner)

    tpl = __fill_newline_if_need(ele, tpl)
    md += tpl.format(prev, block_tag_inner)

    return md

def __transform_other_tags(ele, md, intent):
    other_inner = ''
    for child in ele.children:
        other_inner = __print_tree(child, intent, other_inner)

    # Keep the unknown tag itself but replace its children with the converted content
    ele.clear()
    ele.append('{}')
    md += ele.decode().format(other_inner)

    return md

def __transform_list_tags(ele, md, intent):
    list_text = '\n'
    # Nested lists get an extra four spaces of indentation
    if ele.find_parent(re.compile('[ou]l')):
        intent += 4

    line_head = '* ' if ele.name == 'ul' else '{}. '
    for i, e in enumerate(ele.find_all('li', recursive=False)):
        li_inner = ''
        for child in e.children:
            li_inner = __print_tree(child, intent, li_inner)
        list_text += ' ' * intent + line_head.format(i + 1) + li_inner.lstrip() + '\n'

    md += __fill_newline_if_need(ele, list_text) if list_text.strip() != '' else ''

    return md

def __transform_soup(ele, md, intent):
    for child in ele.children:
        md = __print_tree(child, intent, md)

    return md

def __fill_newline_if_need(ele, text):
    if (ele.next_sibling and ele.next_sibling.name in inline_map['normal'].keys()) \
            or (isinstance(ele.next_sibling, NavigableString) and ele.next_sibling.string.strip() != ''):
        text += '\n'

    if (ele.previous_sibling and ele.previous_sibling.name in inline_map['normal'].keys()) \
            or (isinstance(ele.previous_sibling, NavigableString) and ele.previous_sibling.string.strip() != ''):
        text = '\n' + text

    return text

#########################################################################################
# Extract HTML text from a rich-document JSON node (recursively)
def recursion_json_to_html(data):
    html_data = ""
    # Element node
    if ("type" in data) and data["type"]:
        # Opening tag (tag output is currently disabled; only text is collected)
        if ("style" in data) and data["style"]:
            # html_data = html_data + "<" + data["type"] + " style=\"" + data["style"] + "\"" + ">";
            pass
        else:
            # html_data = html_data + "<" + data["type"] + ">";
            pass
    # Children
    if "children" in data:
        for children_item in data["children"]:
            tmp_data = recursion_json_to_html(children_item)
            html_data = html_data + tmp_data
    # Closing tag (also disabled)
    # html_data = html_data + "</" + data["type"] + ">";
    # Text content
    if ("text" in data) and data["text"]:
        html_data = html_data + data["text"]
    # Return the result
    return html_data

# Extract HTML text from a rich-document JSON string
def json_to_html(json_str):
    html_data = ""
    data = json.loads(json_str)
    # Convert recursively
    html_data = recursion_json_to_html(data)
    # Return the result
    return html_data
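
# Illustrative example (the node shape below is assumed from the code above, not
# taken from a real document):
#   json_to_html('{"type": "p", "children": [{"type": "span", "text": "hello"}]}')
#   # -> 'hello'  (tags are not emitted, only the text is collected)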

# Extract plain text from a rich-document JSON string
def json_to_text(json_str):
    # Extract the HTML text from the rich-document JSON
    html_content = json_to_html(json_str)
    # Strip any HTML, keeping only the text
    text_content = get_text_from_html(html_content)
    return text_content

# Convert an AI Rich Editor document (JSON) to plain text
def ai_rich_json_to_text(json_str):
    content = ""
    if json_str:
        data = json.loads(json_str)
        if data.get("type", None) == "doc":
            for item_i in data.get("content"):
                tmp_item_i_content = item_i.get("content", None)
                if tmp_item_i_content:
                    for item_j in tmp_item_i_content:
                        if item_j.get("type", None) == "text":
                            tmp_text = item_j.get("text", "")
                            content = content + tmp_text
    # Return the result
    return content
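
# Illustrative example (the document shape below is assumed; only "text" children of
# top-level blocks are collected):
#   doc = '{"type": "doc", "content": [{"type": "paragraph", "content": [{"type": "text", "text": "hello"}]}]}'
#   ai_rich_json_to_text(doc)  # -> 'hello'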

# Excel --> markdown
def excel_to_markdown(file_path):

    print(f'====================excel_to_markdown====1==================')

    md = MarkItDown()
    result = md.convert(file_path)

    print(result.text_content)

    return result.text_content
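
# Example usage (sketch only; 'report.xlsx' is a placeholder path):
#   md_text = excel_to_markdown('report.xlsx')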

def get_hotnew_url(url):
    yaowen_url = []
    rebang_url = []
    jishi_url = []
    # Fetch the pages
    html = url_to_raw_html(url)
    html_other = url_to_raw_html(url + "/scroll-news/news1.html")
    # Parse the HTML into BeautifulSoup objects
    soup = bs(html, 'html.parser')
    soup_other = bs(html_other, 'html.parser')
    # Use select() to grab the link elements of each section
    yaowen_lists = soup.select('.news-left .ywjx-news-list ul li a')  # top stories (yaowen)
    rebang_lists = soup.select('.news-left .rdph-list.rdph-list2 ul li a')  # hot list (rebang)
    jishi_lists = soup_other.select('.content-left .content_list ul li .dd_bt a')  # latest news (jishi)
    # Top stories
    for item in yaowen_lists:
        tmp_url = item["href"]
        if "://" in tmp_url:
            pass
        elif "//" in tmp_url:
            tmp_url = "https:" + tmp_url
        else:
            tmp_url = url + tmp_url
        # Collect
        yaowen_url.append(tmp_url)
    # Hot list
    for item in rebang_lists:
        tmp_url = item["href"]
        if "://" in tmp_url:
            pass
        elif "//" in tmp_url:
            tmp_url = "https:" + tmp_url
        else:
            tmp_url = url + tmp_url
        # Collect
        rebang_url.append(tmp_url)
    # Latest news
    for item in jishi_lists:
        tmp_url = item["href"]
        if "://" in tmp_url:
            pass
        elif "//" in tmp_url:
            tmp_url = "https:" + tmp_url
        else:
            tmp_url = url + tmp_url
        # Collect
        jishi_url.append(tmp_url)

    # print(yaowen_url)
    # print(f'----------------------------------------------------')
    # print(rebang_url)
    # print(f'----------------------------------------------------')
    # print(jishi_url)

    return (yaowen_url, rebang_url, jishi_url)

def get_hotnews_content(url):
    title = ""
    datetime = ""
    content = ""
    # Fetch the page
    html = url_to_raw_html(url)
    # Parse the HTML into a BeautifulSoup object
    soup = bs(html, 'html.parser')
    # Use select() to grab the relevant elements
    title_element = soup.select('.content .content_maincontent_more h1')  # title
    datetime_element = soup.select('.content .content_maincontent_more .content_left_time')  # date + time
    content_element = soup.select('.content .content_maincontent_more .content_maincontent_content .left_zw')  # body
    # Title
    for item in title_element:
        title = item.text if item.text else ""
    # Date and time, normalised to e.g. 2025-04-08 02:55
    for item in datetime_element:
        # item.text looks like: 2025年04月07日 23:55 来源:...
        datetime = item.contents[0] if (item.contents and (0 < len(item.contents))) else ""
        datetime = datetime.replace(" ", "").replace("来源:", "").replace("\r", "").replace("\n", "").replace("年", "-").replace("月", "-").replace("日", " ")
        datetime = datetime[0:16]
    # Body
    for item in content_element:
        content = item.text if item.text else ""

    # print(title)
    # print(f'----------------------------------------------------')
    # print(datetime)
    # print(f'----------------------------------------------------')
    # print(content)

    return (title, datetime, content)
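
# Minimal manual smoke test (illustrative sketch only; nothing runs on plain import).
# Usage:  python <this file> <article_url>
if __name__ == '__main__':
    import sys
    if len(sys.argv) > 1:
        page_title, page_md = url_to_markdown(sys.argv[1])
        print(page_title)
        print(page_md)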