sdk/qywx-sdk/url_md_handle.py

#!/usr/bin/env python
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import re
import json
import time
import random
import requests
from bs4 import NavigableString, BeautifulSoup as bs
from markitdown import MarkItDown

inline_tags = ['a', 'img', 'b', 'strong', 'em', 'i', 'code', 'del']
# block_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'blockquote', 'pre']
block_map = {
    'normal': {
        'h1': '\n# {}\n',
        'h2': '\n## {}\n',
        'h3': '\n### {}\n',
        'h4': '\n#### {}\n',
        'h5': '\n##### {}\n',
        'h6': '\n###### {}\n',
        'hr': '\n\n---\n',
        'div': '{}',
        'article': '{}',
        'section': '{}'
    },
    'intent': {
        'p': '\n{}{}\n',
        'blockquote': '{}> {}',
        'pre': '\n{}```{}\n{}\n{}```\n'
    }
}
inline_map = {
    'normal': {
        'i': '*{}*',
        'em': '*{}*',
        'b': '**{}**',
        'strong': '**{}**',
        'del': '~~{}~~',
        'code': '`{}`'
    },
    'link': {
        'a': '[{}]({})',
        'img': '![{}]({})'
    }
}
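
# A quick illustration of how the templates above are applied (plain str.format,
# shown as comments only, not executed):
#   block_map['normal']['h2'].format('Title')      --> '\n## Title\n'
#   inline_map['normal']['b'].format('bold')       --> '**bold**'
#   inline_map['link']['a'].format('text', 'url')  --> '[text](url)'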


# Extract plain text from HTML
def get_text_from_html(html_text, is_file=False):
    html_content = html_text
    # If given a file path, read the HTML from the file
    if is_file:
        with open(html_text, "r", encoding="utf-8") as file_obj:
            html_content = file_obj.read()
    # Parse the HTML into a BeautifulSoup object
    soup = bs(html_content, "html.parser")
    # Extract the text content
    raw_text = soup.get_text()
    return raw_text
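
# Usage sketch (the inputs below are made-up placeholders):
#   get_text_from_html('<p>Hello <b>world</b></p>')  --> 'Hello world'
#   get_text_from_html('page.html', is_file=True)    # read HTML from a local file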


# Crawl an article by URL and convert it to Markdown
def url_to_markdown(url):
    title = ""
    md = ""
    try:
        headers = {
            "Accept": "application/json, text/plain, */*",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
            "Connection": "keep-alive",
            "Sec-Ch-Ua": '"Not A(Brand";v="99", "Microsoft Edge";v="121", "Chromium";v="121"',
            "Sec-Ch-Ua-Mobile": "?0",
            "Sec-Ch-Ua-Platform": '"Windows"',
            "Sec-Fetch-Dest": 'empty',
            "Sec-Fetch-Mode": 'cors',
            "Sec-Fetch-Site": 'same-origin',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0"
        }
        # Fetch the HTML
        res = requests.get(url, headers=headers)
        html_content = res.text
        # Extract the title
        soup = bs(html_content, 'html.parser')
        title = soup.title.string
        # Convert the page body to Markdown
        html_md = convert(html_content)
        # Filter any remaining HTML out of the result
        md = get_text_from_html(html_md)
        # print(md)
        # with open('test.md', "w", encoding="utf-8") as file_obj:
        #     file_obj.write(md)
    except Exception as e:
        print(f'--url_to_markdown--> crawl failed: e={e}, url={url}')
    return (title, md)
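
# Usage sketch (placeholder URL, not a real target):
#   title, md = url_to_markdown('https://example.com/post/1')
#   print(title)  # page <title>
#   print(md)     # article body as Markdown-flavored text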


# Crawl the raw HTML for a URL
def url_to_raw_html(url):
    html_content = b""
    try:
        headers = {
            "Accept": "application/json, text/plain, */*",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
            "Connection": "keep-alive",
            "Sec-Ch-Ua": '"Not A(Brand";v="99", "Microsoft Edge";v="121", "Chromium";v="121"',
            "Sec-Ch-Ua-Mobile": "?0",
            "Sec-Ch-Ua-Platform": '"Windows"',
            "Sec-Fetch-Dest": 'empty',
            "Sec-Fetch-Mode": 'cors',
            "Sec-Fetch-Site": 'same-origin',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0"
        }
        # Fetch the HTML (raw bytes)
        res = requests.get(url, headers=headers)
        html_content = res.content
    except Exception as e:
        print(f'--url_to_raw_html--> crawl failed: e={e}, url={url}')
    return html_content


# Convert already-fetched HTML to Markdown
def html_to_markdown(html_content):
    title = ""
    md = ""
    try:
        # Extract the title
        soup = bs(html_content, 'html.parser')
        title = soup.title.string
        # Convert the page body to Markdown
        html_md = convert(html_content)
        # Filter any remaining HTML out of the result
        md = get_text_from_html(html_md)
        # print(md)
        # with open('test.md', "w", encoding="utf-8") as file_obj:
        #     file_obj.write(md)
    except Exception:
        pass
    return (title, md)


# List all article URLs for a cnblogs user
def cnblogs_list(user_name):
    headers = {
        "Accept": "application/json, text/plain, */*",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        "Connection": "keep-alive",
        "Cookie": "",
        "Host": "",
        "Referer": "",
        "Sec-Ch-Ua": '"Not A(Brand";v="99", "Microsoft Edge";v="121", "Chromium";v="121"',
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": '"Windows"',
        "Sec-Fetch-Dest": 'empty',
        "Sec-Fetch-Mode": 'cors',
        "Sec-Fetch-Site": 'same-origin',
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0"
    }
    # Target blog URL
    url = 'https://www.cnblogs.com/' + user_name + '/?page=1'
    # Fetch the HTML
    response = requests.get(url, headers=headers)
    # Parse the HTML into a BeautifulSoup object
    soup = bs(response.text, 'html.parser')
    page_num = 1
    # Use select() to pick every page-number element from the pager
    page_lists = soup.select('div.pager a')
    for a_element in page_lists:
        numbers = re.findall(r'\d+', a_element.text)
        if numbers:
            number = int(numbers[0])  # convert the extracted digits to an integer
            if page_num < number:
                page_num = number
    paper_list = []
    articles = soup.select('div.day div.postTitle a')
    for article in articles:
        href = article['href']
        # print(href)
        paper_list.append(href)
    if page_num > 1:
        random_wait_time = random.randint(1, 2)
        time.sleep(random_wait_time)
        for i in range(2, page_num + 1):
            url = 'https://www.cnblogs.com/' + user_name + '/?page=' + str(i)
            response = requests.get(url, headers=headers)
            # print(response.status_code)
            soup = bs(response.text, 'html.parser')
            articles = soup.select('div.day div.postTitle a')
            for article in articles:
                href = article['href']
                # print(href)
                paper_list.append(href)
    return paper_list


# List all article URLs for a CSDN user (via the community home API)
def csdn_list(user_name):
    headers = {
        "Accept": "application/json, text/plain, */*",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        "Connection": "keep-alive",
        "Cookie": "",
        "Host": "blog.csdn.net",
        "Referer": f"https://blog.csdn.net/{user_name}?type=blog",
        "Sec-Ch-Ua": '"Not A(Brand";v="99", "Microsoft Edge";v="121", "Chromium";v="121"',
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": '"Windows"',
        "Sec-Fetch-Dest": 'empty',
        "Sec-Fetch-Mode": 'cors',
        "Sec-Fetch-Site": 'same-origin',
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0"
    }
    # API endpoint returning the user's article count
    url = 'https://blog.csdn.net/community/home-api/v1/get-tab-total?username=' + user_name
    response = requests.get(url, headers=headers)
    paper_list = []
    if response.status_code == 200:
        data = response.json()
        article_num = data['data']['blog']
        # print(article_num)
        page_num = article_num // 20  # integer division
        remainder = article_num % 20  # remainder
        if remainder > 0:
            page_num += 1  # one extra page when the remainder is non-zero
        for i in range(1, page_num + 1):
            url = 'https://blog.csdn.net/community/home-api/v1/get-business-list?page=' + str(i) + '&size=20&businessType=blog&orderby=&noMore=false&year=&month=&username=' + user_name
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                data = response.json()
                for item in data['data']['list']:
                    href = item['url']
                    # print(href)
                    paper_list.append(href)
    return paper_list
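
# Note: the page count above is just ceil(article_num / 20); an equivalent
# one-liner, shown only as an illustration, would be:
#   page_num = (article_num + 19) // 20  # e.g. 41 articles -> 3 pages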


# List all article URLs for a Zhihu user
def zhihu_list(user_name):
    headers = {
        "Accept": "application/json, text/plain, */*",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        "Connection": "keep-alive",
        "Cookie": "",
        "Host": "",
        "Referer": "",
        "Sec-Ch-Ua": '"Not A(Brand";v="99", "Microsoft Edge";v="121", "Chromium";v="121"',
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": '"Windows"',
        "Sec-Fetch-Dest": 'empty',
        "Sec-Fetch-Mode": 'cors',
        "Sec-Fetch-Site": 'same-origin',
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0"
    }
    # Target profile URL
    url = 'https://www.zhihu.com/people/' + user_name + '/posts?page=1'
    # Fetch the HTML
    response = requests.get(url, headers=headers)
    # Parse the HTML into a BeautifulSoup object
    soup = bs(response.text, 'html.parser')
    page_num = 1
    # Use select() to pick every page-number element from the pager
    page_lists = soup.select('div.Pagination button')
    for a_element in page_lists:
        numbers = re.findall(r'\d+', a_element.text)
        if numbers:
            number = int(numbers[0])  # convert the extracted digits to an integer
            if page_num < number:
                page_num = number
    paper_list = []
    articles = soup.select('h2.ContentItem-title span a')
    for article in articles:
        href = article['href']
        # print(href)
        paper_list.append(href)
    if page_num > 1:
        random_wait_time = random.randint(1, 2)
        time.sleep(random_wait_time)
        for i in range(2, page_num + 1):
            url = 'https://www.zhihu.com/people/' + user_name + '/posts?page=' + str(i)
            response = requests.get(url, headers=headers)
            # print(response.status_code)
            soup = bs(response.text, 'html.parser')
            articles = soup.select('h2.ContentItem-title span a')
            for article in articles:
                href = article['href']
                # print(href)
                paper_list.append(href)
    return paper_list


# cnblogs
# user_name = 'lingwang3'
# blog_name = 'cnblogs'
# cnblogs_list(user_name)
# csdn
# user_name = 'weixin_40340586'
# blog_name = 'csdn'
# csdn_list(user_name)
# zhihu ye-chi-4-96
# user_name = '--89-68-45'
# blog_name = 'zhihu'
# zhihu_list(user_name)


def convert(html):
    soup = bs(html, 'html.parser')
    # CSDN - #content_views
    # cnblogs - #cnblogs_post_body
    # zhihu - .Post-RichTextContainer
    container = soup.select_one('#content_views') \
        or soup.select_one('#cnblogs_post_body') \
        or soup.select_one('.Post-RichTextContainer') \
        or soup.select_one('article .post-container') \
        or soup.select_one('article #content') \
        or soup.select_one('.page-article .main-content .article-main .article-content') \
        or soup.select_one('article') \
        or soup.select_one('body') \
        or soup
    return __print_tree(container)
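
# Usage sketch (the fragment below is made up; output is approximate):
#   html = '<article><h1>Title</h1><p>Body with <b>bold</b> text.</p></article>'
#   print(convert(html))
#   # -->
#   # # Title
#   #
#   # Body with **bold** text.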


def __print_tree(ele, intent=0, md=''):
    """Recursively walk the DOM (recursion keeps this simple for now).

    Arguments:
        ele {bs} -- element to convert

    Keyword Arguments:
        intent {int} -- indentation level (default: {0})
        md {str} -- converted document so far (default: {''})

    Returns:
        str -- converted document
    """
    if isinstance(ele, NavigableString):
        md = __transform_text(ele, md)
    elif ele.name == 'img':
        md = __transform_img(ele, md)
    elif ele.name == 'a':
        md = __transform_a(ele, md, intent)
    elif ele.name in inline_map['normal'].keys():
        md = __transform_inline_tags(ele, md, intent)
    elif ele.name == 'pre':
        md = __transform_pre(ele, md, intent)
    elif ele.name in ('ul', 'ol'):
        md = __transform_list_tags(ele, md, intent)
    elif ele.name in block_map['normal'].keys():
        md = __transform_block_normal_tags(ele, md, intent)
    elif ele.name in block_map['intent'].keys():
        md = __transform_block_intent_tags(ele, md, intent)
    elif ele.name == '[document]':
        md = __transform_soup(ele, md, intent)
    else:
        md = __transform_other_tags(ele, md, intent)
    return md


def __transform_text(ele, md):
    # Collapse whitespace runs; trim edges unless the neighbor is an inline tag
    text = re.compile(r'[\s]+').sub(' ', ele.string)
    text = text if ele.previous_sibling and ele.previous_sibling.name in inline_tags else text.lstrip()
    text = text if ele.next_sibling and ele.next_sibling.name in inline_tags else text.rstrip()
    md += text
    return md


def __transform_img(ele, md):
    md += inline_map['link']['img'].format(ele.get('alt') or '', ele.get('src') or '')
    return md


def __transform_a(ele, md, intent):
    a_inner = ''
    for child in ele.children:
        a_inner = __print_tree(child, intent, a_inner)
    if a_inner != '':
        md += inline_map['link']['a'].format(a_inner, ele.get('href') or ele.get_text(strip=True))
    return md


def __transform_pre(ele, md, intent):
    # Use the highlight.js class (minus 'hljs' itself) as the code-fence language
    lang_tag = ele.find(class_='hljs')
    if lang_tag:
        lang_tag['class'].remove('hljs')
    lang = ''.join(lang_tag['class']) if lang_tag else ''
    md += block_map['intent']['pre'].format(' ' * intent, lang, ele.text.strip().replace('\n', '\n' + ' ' * intent), ' ' * intent)
    return md


def __transform_inline_tags(ele, md, intent):
    inline_tag_inner = ''
    for child in ele.children:
        inline_tag_inner = __print_tree(child, intent, inline_tag_inner)
    if inline_tag_inner:
        md += inline_map['normal'][ele.name].format(inline_tag_inner)
    return md


def __transform_block_normal_tags(ele, md, intent):
    block_tag_inner = ''
    for child in ele.children:
        block_tag_inner = __print_tree(child, intent, block_tag_inner)
    md += block_map['normal'][ele.name].format(block_tag_inner)
    return md


def __transform_block_intent_tags(ele, md, intent):
    block_tag_inner = ''
    tpl = block_map['intent'][ele.name]
    prev = ' ' * intent
    if ele.parent.name == 'blockquote':
        # Inherit the parent's '> ' prefix and deepen it for nested quotes
        prev = ele.parent['data-prev']
        ele['data-prev'] = ele.parent['data-prev'] + '> '
        tpl = ele.parent['data-prev'] + '\n' + tpl + '\n'
    elif ele.name == 'blockquote':
        tpl = '\n' + tpl + '\n'
        ele['data-prev'] = ' ' * intent + '> '
    for child in ele.children:
        block_tag_inner = __print_tree(child, intent, block_tag_inner)
    tpl = __fill_newline_if_need(ele, tpl)
    md += tpl.format(prev, block_tag_inner)
    return md


def __transform_other_tags(ele, md, intent):
    # Unknown tag: convert the children, then re-emit the tag itself around them
    other_inner = ''
    for child in ele.children:
        other_inner = __print_tree(child, intent, other_inner)
    ele.clear()
    ele.append('{}')
    md += ele.decode().format(other_inner)
    return md


def __transform_list_tags(ele, md, intent):
    list_text = '\n'
    # Nested lists get four extra spaces of indentation
    if ele.find_parent(re.compile('[ou]l')):
        intent += 4
    line_head = '* ' if ele.name == 'ul' else '{}. '
    for i, e in enumerate(ele.find_all('li', recursive=False)):
        li_inner = ''
        for child in e.children:
            li_inner = __print_tree(child, intent, li_inner)
        list_text += ' ' * intent + line_head.format(i + 1) + li_inner.lstrip() + '\n'
    md += __fill_newline_if_need(ele, list_text) if list_text.strip() != '' else ''
    return md


def __transform_soup(ele, md, intent):
    for child in ele.children:
        md = __print_tree(child, intent, md)
    return md


def __fill_newline_if_need(ele, text):
    # Pad with newlines when the element sits next to inline content
    if ele.next_sibling and ele.next_sibling.name in inline_map['normal'].keys() \
            or isinstance(ele.next_sibling, NavigableString) and ele.next_sibling.string.strip() != '':
        text += '\n'
    if ele.previous_sibling and ele.previous_sibling.name in inline_map['normal'].keys() \
            or isinstance(ele.previous_sibling, NavigableString) and ele.previous_sibling.string.strip() != '':
        text = '\n' + text
    return text


#########################################################################################
# Extract HTML text from a document-rich JSON tree
def recursion_json_to_html(data):
    html_data = ""
    # Element node
    if data.get("type"):
        # Opening tag (tag emission is currently disabled; only text survives)
        if data.get("style"):
            # html_data = html_data + "<" + data["type"] + " style=\"" + data["style"] + "\"" + ">"
            pass
        else:
            # html_data = html_data + "<" + data["type"] + ">"
            pass
    # Child nodes
    if "children" in data:
        for children_item in data["children"]:
            tmp_data = recursion_json_to_html(children_item)
            html_data = html_data + tmp_data
    # Closing tag (currently disabled)
    # html_data = html_data + "</" + data["type"] + ">"
    # Text content
    if data.get("text"):
        html_data = html_data + data["text"]
    return html_data


# Extract HTML text from a document-rich JSON string
def json_to_html(json_str):
    data = json.loads(json_str)
    # Convert recursively
    html_data = recursion_json_to_html(data)
    return html_data
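
# Shape of the input expected above, inferred from the keys it reads
# (type / style / children / text); the values here are made up:
#   doc = {"type": "div", "children": [
#       {"type": "p", "children": [{"text": "hello "}, {"text": "world"}]}
#   ]}
#   json_to_html(json.dumps(doc))  --> 'hello world'  (tags are not emitted)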


# Extract plain text from a document-rich JSON string
def json_to_text(json_str):
    # JSON -> HTML text
    html_content = json_to_html(json_str)
    # HTML -> plain text
    text_content = get_text_from_html(html_content)
    return text_content


# AI Rich Editor document -> plain text
def ai_rich_json_to_text(json_str):
    content = ""
    if json_str:
        data = json.loads(json_str)
        if data.get("type", None) == "doc":
            for item_i in data.get("content"):
                tmp_item_i_content = item_i.get("content", None)
                if tmp_item_i_content:
                    for item_j in tmp_item_i_content:
                        if item_j.get("type", None) == "text":
                            tmp_text = item_j.get("text", "")
                            content = content + tmp_text
    return content
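
# The shape handled above matches a ProseMirror/Tiptap-style document; a
# minimal made-up example:
#   doc = {"type": "doc", "content": [
#       {"type": "paragraph", "content": [{"type": "text", "text": "hello"}]}
#   ]}
#   ai_rich_json_to_text(json.dumps(doc))  --> 'hello'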


# Excel --> markdown
def excel_to_markdown(file_path):
    print('====================excel_to_markdown====1==================')
    md = MarkItDown()
    result = md.convert(file_path)
    print(result.text_content)
    return result.text_content
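
# Usage sketch ('report.xlsx' is a placeholder path):
#   table_md = excel_to_markdown('report.xlsx')  # sheet rendered as Markdown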


# Collect article URLs from the hot-news landing pages
def get_hotnew_url(url):
    yaowen_url = []
    rebang_url = []
    jishi_url = []
    # Fetch the page content
    html = url_to_raw_html(url)
    html_other = url_to_raw_html(url + "/scroll-news/news1.html")
    # Parse the HTML into BeautifulSoup objects
    soup = bs(html, 'html.parser')
    soup_other = bs(html_other, 'html.parser')
    # Use select() to pick the link elements of each section
    yaowen_lists = soup.select('.news-left .ywjx-news-list ul li a')  # top stories
    rebang_lists = soup.select('.news-left .rdph-list.rdph-list2 ul li a')  # China News hot list
    jishi_lists = soup_other.select('.content-left .content_list ul li .dd_bt a')  # latest news
    # Top stories: normalize relative and protocol-relative links
    for item in yaowen_lists:
        tmp_url = item["href"]
        if "://" in tmp_url:
            pass
        elif "//" in tmp_url:
            tmp_url = "https:" + tmp_url
        else:
            tmp_url = url + tmp_url
        yaowen_url.append(tmp_url)
    # Hot list
    for item in rebang_lists:
        tmp_url = item["href"]
        if "://" in tmp_url:
            pass
        elif "//" in tmp_url:
            tmp_url = "https:" + tmp_url
        else:
            tmp_url = url + tmp_url
        rebang_url.append(tmp_url)
    # Latest news
    for item in jishi_lists:
        tmp_url = item["href"]
        if "://" in tmp_url:
            pass
        elif "//" in tmp_url:
            tmp_url = "https:" + tmp_url
        else:
            tmp_url = url + tmp_url
        jishi_url.append(tmp_url)
    # print(yaowen_url)
    # print(rebang_url)
    # print(jishi_url)
    return (yaowen_url, rebang_url, jishi_url)
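
# Usage sketch; the selectors above appear to target the chinanews.com layout,
# so the base URL below is an assumption, not confirmed by this file:
#   yaowen, rebang, jishi = get_hotnew_url('https://www.chinanews.com.cn')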


# Extract title, datetime, and body text from a hot-news article page
def get_hotnews_content(url):
    title = ""
    datetime = ""
    content = ""
    # Fetch the page content
    html = url_to_raw_html(url)
    # Parse the HTML into a BeautifulSoup object
    soup = bs(html, 'html.parser')
    # Use select() to pick the title / datetime / body elements
    title_element = soup.select('.content .content_maincontent_more h1')  # title
    datetime_element = soup.select('.content .content_maincontent_more .content_left_time')  # date + time
    content_element = soup.select('.content .content_maincontent_more .content_maincontent_content .left_zw')  # body
    # Title
    for item in title_element:
        title = item.text if item.text else ""
    # Datetime, normalized to the form 2025-04-08 02:55
    for item in datetime_element:
        # item.text --> 2025年04月07日 23:55 来源
        datetime = item.contents[0] if (item.contents and 0 < len(item.contents)) else ""
        datetime = datetime.replace(" ", "").replace("来源:", "").replace("\r", "").replace("\n", "").replace("年", "-").replace("月", "-").replace("日", " ")
        datetime = datetime[0:16]
    # Body
    for item in content_element:
        content = item.text if item.text else ""
    # print(title)
    # print(datetime)
    # print(content)
    return (title, datetime, content)
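
# Usage sketch (placeholder article URL, same assumed site layout as above):
#   title, dt, body = get_hotnews_content('https://www.chinanews.com.cn/gn/2025/04-08/example.shtml')
#   # dt is normalized to 'YYYY-MM-DD HH:MM', e.g. '2025-04-08 02:55'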