sdk/qywx-sdk/url_md_handle.py

#!/usr/bin/env python
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import re
import json
import time
import random
import requests
from bs4 import NavigableString, BeautifulSoup as bs
from markitdown import MarkItDown
inline_tags = ['a', 'img', 'b', 'strong', 'em', 'i', 'code', 'del']
# block_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'blockquote', 'pre']
block_map = {
    'normal': {
        'h1': '\n# {}\n',
        'h2': '\n## {}\n',
        'h3': '\n### {}\n',
        'h4': '\n#### {}\n',
        'h5': '\n##### {}\n',
        'h6': '\n###### {}\n',
        'hr': '\n\n---\n',
        'div': '{}',
        'article': '{}',
        'section': '{}'
    },
    'intent': {
        'p': '\n{}{}\n',
        'blockquote': '{}> {}',
        'pre': '\n{}```{}\n{}\n{}```\n'
    }
}
inline_map = {
    'normal': {
        'i': '*{}*',
        'em': '*{}*',
        'b': '**{}**',
        'strong': '**{}**',
        'del': '~~{}~~',
        'code': '`{}`'
    },
    'link': {
        'a': '[{}]({})',
        'img': '![{}]({})'
    }
}

# Extract plain text from HTML.
def get_text_from_html(html_text, is_file=False):
    html_content = html_text
    # If given a file path, read the HTML from the file first.
    if is_file:
        with open(html_text, "r", encoding="utf-8") as file_obj:
            html_content = file_obj.read()
    # Parse the HTML page into a BeautifulSoup object.
    soup = bs(html_content, "html.parser")
    # Return only the visible text content.
    raw_text = soup.get_text()
    return raw_text

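# Example usage (illustrative only; the HTML snippet and file name are made-up placeholders):
# sample_html = '<html><body><h1>Title</h1><p>Hello <b>world</b></p></body></html>'
# print(get_text_from_html(sample_html))        # -> 'TitleHello world' (tags stripped)
# print(get_text_from_html('page.html', True))  # read the HTML from a local file instead
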
# Fetch an article from a URL and convert it to Markdown.
def url_to_markdown(url):
    title = ""
    md = ""
    try:
        headers = {
            "Accept": "application/json, text/plain, */*",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
            "Connection": "keep-alive",
            "Sec-Ch-Ua": '"Not A(Brand";v="99", "Microsoft Edge";v="121", "Chromium";v="121"',
            "Sec-Ch-Ua-Mobile": "?0",
            "Sec-Ch-Ua-Platform": '"Windows"',
            "Sec-Fetch-Dest": 'empty',
            "Sec-Fetch-Mode": 'cors',
            "Sec-Fetch-Site": 'same-origin',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0"
        }
        # Fetch the raw HTML.
        res = requests.get(url, headers=headers)
        html_content = res.text
        # Extract the page title.
        soup = bs(html_content, 'html.parser')
        title = soup.title.string
        # Convert the article body to Markdown.
        html_md = convert(html_content)
        # Strip any HTML left over in the converted result.
        md = get_text_from_html(html_md)
        # print(md)
        # with open('test.md', "w", encoding="utf-8") as file_obj:
        #     file_obj.write(md)
    except Exception as e:
        print(f'--url_to_markdown--> crawl failed: e={e}, url={url}')
    return (title, md)

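# Example usage (illustrative only; the article URL is a placeholder):
# title, md = url_to_markdown('https://blog.csdn.net/some_user/article/details/123456789')
# print(title)
# print(md)
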
# Fetch the raw article HTML from a URL.
def url_to_raw_html(url):
    html_content = ""
    try:
        headers = {
            "Accept": "application/json, text/plain, */*",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
            "Connection": "keep-alive",
            "Sec-Ch-Ua": '"Not A(Brand";v="99", "Microsoft Edge";v="121", "Chromium";v="121"',
            "Sec-Ch-Ua-Mobile": "?0",
            "Sec-Ch-Ua-Platform": '"Windows"',
            "Sec-Fetch-Dest": 'empty',
            "Sec-Fetch-Mode": 'cors',
            "Sec-Fetch-Site": 'same-origin',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0"
        }
        # Fetch the raw HTML (bytes).
        res = requests.get(url, headers=headers)
        html_content = res.content
    except Exception as e:
        print(f'--url_to_raw_html--> crawl failed: e={e}, url={url}')
    return html_content

# Convert article HTML to Markdown.
def html_to_markdown(html_content):
    title = ""
    md = ""
    try:
        # Extract the page title.
        soup = bs(html_content, 'html.parser')
        title = soup.title.string
        # Convert the article body to Markdown.
        html_md = convert(html_content)
        # Strip any HTML left over in the converted result.
        md = get_text_from_html(html_md)
        # print(md)
        # with open('test.md', "w", encoding="utf-8") as file_obj:
        #     file_obj.write(md)
    except Exception:
        pass
    return (title, md)

# List all post URLs of a cnblogs user.
def cnblogs_list(user_name):
    headers = {
        "Accept": "application/json, text/plain, */*",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        "Connection": "keep-alive",
        "Cookie": "",
        "Host": "",
        "Referer": "",
        "Sec-Ch-Ua": '"Not A(Brand";v="99", "Microsoft Edge";v="121", "Chromium";v="121"',
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": '"Windows"',
        "Sec-Fetch-Dest": 'empty',
        "Sec-Fetch-Mode": 'cors',
        "Sec-Fetch-Site": 'same-origin',
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0"
    }
    # Target blog URL (first page).
    url = 'https://www.cnblogs.com/' + user_name + '/?page=1'
    # Fetch the HTML content.
    response = requests.get(url, headers=headers)
    # Parse the HTML into a BeautifulSoup object.
    soup = bs(response.text, 'html.parser')
    page_num = 1
    # Use select() to pick every page-number element from the pager.
    page_lists = soup.select('div.pager a')
    for a_element in page_lists:
        numbers = re.findall(r'\d+', a_element.text)
        if numbers:
            number = int(numbers[0])  # convert the extracted digits to an integer
            if page_num < number:
                page_num = number
    paper_list = []
    articles = soup.select('div.day div.postTitle a')
    for article in articles:
        href = article['href']
        # print(href)
        paper_list.append(href)
    if page_num > 1:
        random_wait_time = random.randint(1, 2)
        time.sleep(random_wait_time)
        for i in range(2, page_num + 1):
            url = 'https://www.cnblogs.com/' + user_name + '/?page=' + str(i)
            response = requests.get(url, headers=headers)
            # print(response.status_code)
            soup = bs(response.text, 'html.parser')
            articles = soup.select('div.day div.postTitle a')
            for article in articles:
                href = article['href']
                # print(href)
                paper_list.append(href)
    return paper_list

# List all post URLs of a CSDN user via the community API.
def csdn_list(user_name):
    headers = {
        "Accept": "application/json, text/plain, */*",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        "Connection": "keep-alive",
        "Cookie": "",
        "Host": "blog.csdn.net",
        "Referer": f"https://blog.csdn.net/{user_name}?type=blog",
        "Sec-Ch-Ua": '"Not A(Brand";v="99", "Microsoft Edge";v="121", "Chromium";v="121"',
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": '"Windows"',
        "Sec-Fetch-Dest": 'empty',
        "Sec-Fetch-Mode": 'cors',
        "Sec-Fetch-Site": 'same-origin',
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0"
    }
    # API endpoint that returns the user's article count.
    url = 'https://blog.csdn.net/community/home-api/v1/get-tab-total?username=' + user_name
    # Fetch the article count.
    response = requests.get(url, headers=headers)
    page_num = 0  # stays 0 (nothing fetched) if the count request fails
    if response.status_code == 200:
        data = response.json()
        article_num = data['data']['blog']
        # print(article_num)
        page_num = article_num // 20  # full pages of 20 articles
        remainder = article_num % 20  # leftover articles
        if remainder > 0:
            page_num += 1  # one extra page for the remainder
    paper_list = []
    for i in range(1, page_num + 1):
        url = 'https://blog.csdn.net/community/home-api/v1/get-business-list?page=' + str(i) + '&size=20&businessType=blog&orderby=&noMore=false&year=&month=&username=' + user_name
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            data = response.json()
            for item in data['data']['list']:
                href = item['url']
                # print(href)
                paper_list.append(href)
    return paper_list

# List all post URLs of a Zhihu user.
def zhihu_list(user_name):
    headers = {
        "Accept": "application/json, text/plain, */*",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        "Connection": "keep-alive",
        "Cookie": "",
        "Host": "",
        "Referer": "",
        "Sec-Ch-Ua": '"Not A(Brand";v="99", "Microsoft Edge";v="121", "Chromium";v="121"',
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": '"Windows"',
        "Sec-Fetch-Dest": 'empty',
        "Sec-Fetch-Mode": 'cors',
        "Sec-Fetch-Site": 'same-origin',
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0"
    }
    # Target blog URL (first page).
    url = 'https://www.zhihu.com/people/' + user_name + '/posts?page=1'
    # Fetch the HTML content.
    response = requests.get(url, headers=headers)
    # Parse the HTML into a BeautifulSoup object.
    soup = bs(response.text, 'html.parser')
    page_num = 1
    # Use select() to pick every page-number element from the pager.
    page_lists = soup.select('div.Pagination button')
    for a_element in page_lists:
        numbers = re.findall(r'\d+', a_element.text)
        if numbers:
            number = int(numbers[0])  # convert the extracted digits to an integer
            if page_num < number:
                page_num = number
    paper_list = []
    articles = soup.select('h2.ContentItem-title span a')
    for article in articles:
        href = article['href']
        # print(href)
        paper_list.append(href)
    if page_num > 1:
        random_wait_time = random.randint(1, 2)
        time.sleep(random_wait_time)
        for i in range(2, page_num + 1):
            url = 'https://www.zhihu.com/people/' + user_name + '/posts?page=' + str(i)
            response = requests.get(url, headers=headers)
            # print(response.status_code)
            soup = bs(response.text, 'html.parser')
            articles = soup.select('h2.ContentItem-title span a')
            for article in articles:
                href = article['href']
                # print(href)
                paper_list.append(href)
    return paper_list

# cnblogs
# user_name = 'lingwang3'
# blog_name = 'cnblogs'
# cnblogs_list(user_name)
# csdn
# user_name = 'weixin_40340586'
# blog_name = 'csdn'
# csdn_list(user_name)
# zhihu ye-chi-4-96
# user_name = '--89-68-45'
# blog_name = 'zhihu'
# zhihu_list(user_name)

# Pick the main article container out of a page and convert it to Markdown.
def convert(html):
    soup = bs(html, 'html.parser')
    # CSDN    - #content_views
    # cnblogs - #cnblogs_post_body
    # zhihu   - .Post-RichTextContainer
    container = soup.select_one('#content_views') \
        or soup.select_one('#cnblogs_post_body') \
        or soup.select_one('.Post-RichTextContainer') \
        or soup.select_one('article .post-container') \
        or soup.select_one('article #content') \
        or soup.select_one('.page-article .main-content .article-main .article-content') \
        or soup.select_one('article') \
        or soup.select_one('body') \
        or soup
    return __print_tree(container)

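# Example usage (illustrative only; the HTML fragment is a made-up placeholder):
# sample = '<article><h2>Intro</h2><p>Some <strong>bold</strong> text.</p><ul><li>first</li><li>second</li></ul></article>'
# print(convert(sample))
# Expected output (roughly):
#   ## Intro
#   Some **bold** text.
#   * first
#   * second
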
def __print_tree(ele, intent=0, md=''):
    """Recursively walk the DOM (plain recursion keeps the implementation simple).

    Arguments:
        ele {bs} -- element to convert

    Keyword Arguments:
        intent {int} -- indentation level (default: {0})
        md {str} -- Markdown accumulated so far (default: {''})

    Returns:
        str -- the converted document
    """
    if isinstance(ele, NavigableString):
        md = __transform_text(ele, md)
    elif ele.name == 'img':
        md = __transform_img(ele, md)
    elif ele.name == 'a':
        md = __transform_a(ele, md, intent)
    elif ele.name in inline_map['normal'].keys():
        md = __transform_inline_tags(ele, md, intent)
    elif ele.name == 'pre':
        md = __transform_pre(ele, md, intent)
    elif ele.name in ('ul', 'ol'):
        md = __transform_list_tags(ele, md, intent)
    elif ele.name in block_map['normal'].keys():
        md = __transform_block_normal_tags(ele, md, intent)
    elif ele.name in block_map['intent'].keys():
        md = __transform_block_intent_tags(ele, md, intent)
    elif ele.name == '[document]':
        md = __transform_soup(ele, md, intent)
    else:
        md = __transform_other_tags(ele, md, intent)
    return md

def __transform_text(ele, md):
    text = re.compile(r'[\s]+').sub(' ', ele.string)
    text = text if ele.previous_sibling and ele.previous_sibling.name in inline_tags else text.lstrip()
    text = text if ele.next_sibling and ele.next_sibling.name in inline_tags else text.rstrip()
    md += text
    return md

def __transform_img(ele, md):
    md += inline_map['link']['img'].format(ele.get('alt') or '', ele.get('src') or '')
    return md

def __transform_a(ele, md, intent):
    a_inner = ''
    for child in ele.children:
        a_inner = __print_tree(child, intent, a_inner)
    if a_inner != '':
        md += inline_map['link']['a'].format(a_inner, ele.get('href') or ele.get_text(strip=True))
    return md

def __transform_pre(ele, md, intent):
    lang_tag = ele.find(class_='hljs')
    if lang_tag:
        lang_tag['class'].remove('hljs')
    lang = ''.join(lang_tag['class']) if lang_tag else ''
    md += block_map['intent']['pre'].format(' ' * intent, lang, ele.text.strip().replace('\n', '\n' + ' ' * intent), ' ' * intent)
    return md

def __transform_inline_tags(ele, md, intent):
    inline_tag_inner = ''
    for child in ele.children:
        inline_tag_inner = __print_tree(child, intent, inline_tag_inner)
    if inline_tag_inner:
        md += inline_map['normal'][ele.name].format(inline_tag_inner)
    return md

def __transform_block_normal_tags(ele, md, intent):
    block_tag_inner = ''
    for child in ele.children:
        block_tag_inner = __print_tree(child, intent, block_tag_inner)
    md += block_map['normal'][ele.name].format(block_tag_inner)
    return md

def __transform_block_intent_tags(ele, md, intent):
    block_tag_inner = ''
    tpl = block_map['intent'][ele.name]
    prev = ' ' * intent
    if ele.parent.name == 'blockquote':
        prev = ele.parent['data-prev']
        ele['data-prev'] = ele.parent['data-prev'] + '> '
        tpl = ele.parent['data-prev'] + '\n' + tpl + '\n'
    elif ele.name == 'blockquote':
        tpl = '\n' + tpl + '\n'
        ele['data-prev'] = ' ' * intent + '> '
    for child in ele.children:
        block_tag_inner = __print_tree(child, intent, block_tag_inner)
    tpl = __fill_newline_if_need(ele, tpl)
    md += tpl.format(prev, block_tag_inner)
    return md

def __transform_other_tags(ele, md, intent):
    other_inner = ''
    for child in ele.children:
        other_inner = __print_tree(child, intent, other_inner)
    ele.clear()
    ele.append('{}')
    md += ele.decode().format(other_inner)
    return md

def __transform_list_tags(ele, md, intent):
    list_text = '\n'
    if ele.find_parent(re.compile('[ou]l')):
        intent += 4
    line_head = '* ' if ele.name == 'ul' else '{}. '
    for i, e in enumerate(ele.find_all('li', recursive=False)):
        li_inner = ''
        for child in e.children:
            li_inner = __print_tree(child, intent, li_inner)
        list_text += ' ' * intent + line_head.format(i + 1) + li_inner.lstrip() + '\n'
    md += __fill_newline_if_need(ele, list_text) if list_text.strip() != '' else ''
    return md

def __transform_soup(ele, md, intent):
    for child in ele.children:
        md = __print_tree(child, intent, md)
    return md

def __fill_newline_if_need(ele, text):
    if (ele.next_sibling and ele.next_sibling.name in inline_map['normal'].keys()) \
            or (isinstance(ele.next_sibling, NavigableString) and ele.next_sibling.string.strip() != ''):
        text += '\n'
    if (ele.previous_sibling and ele.previous_sibling.name in inline_map['normal'].keys()) \
            or (isinstance(ele.previous_sibling, NavigableString) and ele.previous_sibling.string.strip() != ''):
        text = '\n' + text
    return text

#########################################################################################
# Recursively extract the text of a "document rich" JSON node tree.
def recursion_json_to_html(data):
    html_data = ""
    # Element node
    if ("type" in data) and data["type"]:
        # Opening tag (tag generation is currently disabled; only text is collected).
        if ("style" in data) and data["style"]:
            # html_data = html_data + "<" + data["type"] + " style=\"" + data["style"] + "\"" + ">"
            pass
        else:
            # html_data = html_data + "<" + data["type"] + ">"
            pass
    # Child nodes
    if "children" in data:
        for children_item in data["children"]:
            tmp_data = recursion_json_to_html(children_item)
            html_data = html_data + tmp_data
    # Closing tag (also disabled)
    # html_data = html_data + "</" + data["type"] + ">"
    # Text content
    if ("text" in data) and data["text"]:
        html_data = html_data + data["text"]
    return html_data

# Extract HTML text from a "document rich" JSON document.
def json_to_html(json_str):
    html_data = ""
    data = json.loads(json_str)
    # Convert recursively.
    html_data = recursion_json_to_html(data)
    return html_data

# Extract plain text from a "document rich" JSON document.
def json_to_text(json_str):
    # First extract the HTML text from the JSON document.
    html_content = json_to_html(json_str)
    # Then strip the HTML down to plain text.
    text_content = get_text_from_html(html_content)
    return text_content

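# Example usage (illustrative only; the JSON mirrors the node shape handled above,
# where each node may carry "type", "style", "children" and "text"):
# doc_json = json.dumps({
#     "type": "div",
#     "children": [
#         {"type": "p", "children": [{"text": "Hello "}, {"type": "b", "text": "world"}]}
#     ]
# })
# print(json_to_text(doc_json))   # -> 'Hello world'
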
# Convert AI Rich Editor JSON to plain text.
def ai_rich_json_to_text(json_str):
    content = ""
    if json_str:
        data = json.loads(json_str)
        if "doc" == data.get("type", None):
            for item_i in data.get("content"):
                tmp_item_i_content = item_i.get("content", None)
                if tmp_item_i_content:
                    for item_j in tmp_item_i_content:
                        if "text" == item_j.get("type", None):
                            tmp_text = item_j.get("text", "")
                            content = content + tmp_text
    return content

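# Example usage (illustrative only; a minimal document in the shape this function expects):
# rich_json = json.dumps({
#     "type": "doc",
#     "content": [
#         {"type": "paragraph", "content": [
#             {"type": "text", "text": "Hello "},
#             {"type": "text", "text": "world"}
#         ]}
#     ]
# })
# print(ai_rich_json_to_text(rich_json))   # -> 'Hello world'
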
# Excel --> markdown
def excel_to_markdown(file_path):
    print('====================excel_to_markdown====1==================')
    md = MarkItDown()
    result = md.convert(file_path)
    print(result.text_content)
    return result.text_content

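# Example usage (illustrative only; the path is a placeholder and the
# markitdown package must be installed):
# print(excel_to_markdown('./data/report.xlsx'))
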
# Collect article URLs from the hot-news pages of the target news site:
# returns (top stories, trending list, real-time news).
def get_hotnew_url(url):
    yaowen_url = []
    rebang_url = []
    jishi_url = []
    # Fetch the page content.
    html = url_to_raw_html(url)
    html_other = url_to_raw_html(url + "/scroll-news/news1.html")
    # Parse the HTML into BeautifulSoup objects.
    soup = bs(html, 'html.parser')
    soup_other = bs(html_other, 'html.parser')
    # Use select() to pick the link elements of each section.
    yaowen_lists = soup.select('.news-left .ywjx-news-list ul li a')  # top stories (要闻)
    rebang_lists = soup.select('.news-left .rdph-list.rdph-list2 ul li a')  # trending list (中新热榜)
    jishi_lists = soup_other.select('.content-left .content_list ul li .dd_bt a')  # real-time news (即时新闻)
    # Top stories
    for item in yaowen_lists:
        tmp_url = item["href"]
        if "://" in tmp_url:
            pass
        elif "//" in tmp_url:
            tmp_url = "https:" + tmp_url
        else:
            tmp_url = url + tmp_url
        # Collect
        yaowen_url.append(tmp_url)
    # Trending list
    for item in rebang_lists:
        tmp_url = item["href"]
        if "://" in tmp_url:
            pass
        elif "//" in tmp_url:
            tmp_url = "https:" + tmp_url
        else:
            tmp_url = url + tmp_url
        # Collect
        rebang_url.append(tmp_url)
    # Real-time news
    for item in jishi_lists:
        tmp_url = item["href"]
        if "://" in tmp_url:
            pass
        elif "//" in tmp_url:
            tmp_url = "https:" + tmp_url
        else:
            tmp_url = url + tmp_url
        # Collect
        jishi_url.append(tmp_url)
    # print(yaowen_url)
    # print(f'----------------------------------------------------')
    # print(rebang_url)
    # print(f'----------------------------------------------------')
    # print(jishi_url)
    return (yaowen_url, rebang_url, jishi_url)

# Extract the title, publish datetime and body text from a news article page.
def get_hotnews_content(url):
    title = ""
    datetime = ""
    content = ""
    # Fetch the page content.
    html = url_to_raw_html(url)
    # Parse the HTML into a BeautifulSoup object.
    soup = bs(html, 'html.parser')
    # Use select() to pick the title, date/time and body elements.
    title_element = soup.select('.content .content_maincontent_more h1')  # title
    datetime_element = soup.select('.content .content_maincontent_more .content_left_time')  # date + time
    content_element = soup.select('.content .content_maincontent_more .content_maincontent_content .left_zw')  # body
    # Title
    for item in title_element:
        title = item.text if item.text else ""
    # Date/time, normalised to e.g. "2025-04-08 02:55"
    for item in datetime_element:
        # item.contents[0] looks like "2025年04月07日 23:55 来源"
        datetime = item.contents[0] if (item.contents and (0 < len(item.contents))) else ""
        datetime = datetime.replace(" ", "").replace("来源:", "").replace("\r", "").replace("\n", "")
        datetime = datetime.replace("年", "-").replace("月", "-").replace("日", " ")
        datetime = datetime[0:16]
    # Body
    for item in content_element:
        content = item.text if item.text else ""
    # print(title)
    # print(f'----------------------------------------------------')
    # print(datetime)
    # print(f'----------------------------------------------------')
    # print(content)
    return (title, datetime, content)

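# Example usage (illustrative only; the base URL is a placeholder and assumes a site
# whose page layout matches the selectors above):
# yaowen, rebang, jishi = get_hotnew_url('https://www.chinanews.com.cn')
# for article_url in yaowen[:3]:
#     title, dt, body = get_hotnews_content(article_url)
#     print(title, dt)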