#!/usr/bin/env python
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
import re
import json
import time
import random
import requests
from bs4 import NavigableString, BeautifulSoup as bs
from markitdown import MarkItDown

inline_tags = ['a', 'img', 'b', 'strong', 'em', 'i', 'code', 'del']
# block_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'blockquote', 'pre']
block_map = {
    'normal': {
        'h1': '\n# {}\n',
        'h2': '\n## {}\n',
        'h3': '\n### {}\n',
        'h4': '\n#### {}\n',
        'h5': '\n##### {}\n',
        'h6': '\n###### {}\n',
        'hr': '\n\n---\n',
        'div': '{}',
        'article': '{}',
        'section': '{}'
    },
    'intent': {
        'p': '\n{}{}\n',
        'blockquote': '{}> {}',
        'pre': '\n{}```{}\n{}\n{}```\n'
    }
}
inline_map = {
    'normal': {
        'i': '*{}*',
        'em': '*{}*',
        'b': '**{}**',
        'strong': '**{}**',
        'del': '~~{}~~',
        'code': '`{}`'
    },
    'link': {
        'a': '[{}]({})',
        'img': '![{}]({})'
    }
}


# Extract plain text from HTML.
def get_text_from_html(html_text, is_file=False):
    html_content = html_text
    # If a file path was given, read the HTML from that file instead.
    if is_file:
        with open(html_text, "r", encoding="utf-8") as file_obj:
            html_content = file_obj.read()
    # Parse the HTML page into a BeautifulSoup object.
    soup = bs(html_content, "html.parser")
    # Extract the text content.
    raw_text = soup.get_text()
    return raw_text


# Fetch an article by URL and convert it to Markdown.
def url_to_markdown(url):
    title = ""
    md = ""
    try:
        headers = {
            "Accept": "application/json, text/plain, */*",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
            "Connection": "keep-alive",
            "Sec-Ch-Ua": '"Not A(Brand";v="99", "Microsoft Edge";v="121", "Chromium";v="121"',
            "Sec-Ch-Ua-Mobile": "?0",
            "Sec-Ch-Ua-Platform": '"Windows"',
            "Sec-Fetch-Dest": 'empty',
            "Sec-Fetch-Mode": 'cors',
            "Sec-Fetch-Site": 'same-origin',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0"
        }
        # Fetch the HTML.
        res = requests.get(url, headers=headers)
        html_content = res.text
        # Extract the title.
        soup = bs(html_content, 'html.parser')
        title = soup.title.string
        # Extract the body and convert it to Markdown.
        html_md = convert(html_content)
        # Run the result through the HTML filter once more.
        md = get_text_from_html(html_md)
        # print(md)
        # with open('test.md', "w", encoding="utf-8") as file_obj:
        #     file_obj.write(md)
    except Exception as e:
        print(f'--url_to_markdown--> scraping failed: e={e}, url={url}')
    # Return the result.
    return (title, md)
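
# Usage sketch for the two helpers above (the URL and file name are
# placeholder assumptions, not targets the script itself uses):
# title, md = url_to_markdown('https://example.com/some-post')
# print(title)
# print(md[:200])
# text = get_text_from_html('page.html', is_file=True)  # read HTML from a local file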

# Fetch the raw HTML for a URL.
def url_to_raw_html(url):
    html_content = ""
    try:
        headers = {
            "Accept": "application/json, text/plain, */*",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
            "Connection": "keep-alive",
            "Sec-Ch-Ua": '"Not A(Brand";v="99", "Microsoft Edge";v="121", "Chromium";v="121"',
            "Sec-Ch-Ua-Mobile": "?0",
            "Sec-Ch-Ua-Platform": '"Windows"',
            "Sec-Fetch-Dest": 'empty',
            "Sec-Fetch-Mode": 'cors',
            "Sec-Fetch-Site": 'same-origin',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0"
        }
        # Fetch the HTML.
        res = requests.get(url, headers=headers)
        html_content = res.content
    except Exception as e:
        print(f'--url_to_raw_html--> scraping failed: e={e}, url={url}')
    # Return the result.
    return html_content


# Convert article HTML to Markdown.
def html_to_markdown(html_content):
    title = ""
    md = ""
    try:
        # Extract the title.
        soup = bs(html_content, 'html.parser')
        title = soup.title.string
        # Extract the body and convert it to Markdown.
        html_md = convert(html_content)
        # Run the result through the HTML filter once more.
        md = get_text_from_html(html_md)
        # print(md)
        # with open('test.md', "w", encoding="utf-8") as file_obj:
        #     file_obj.write(md)
    except Exception:
        pass
    return (title, md)


def cnblogs_list(user_name):
    headers = {
        "Accept": "application/json, text/plain, */*",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        "Connection": "keep-alive",
        "Cookie": "",   # Left blank; fill in if the site requires them.
        "Host": "",
        "Referer": "",
        "Sec-Ch-Ua": '"Not A(Brand";v="99", "Microsoft Edge";v="121", "Chromium";v="121"',
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": '"Windows"',
        "Sec-Fetch-Dest": 'empty',
        "Sec-Fetch-Mode": 'cors',
        "Sec-Fetch-Site": 'same-origin',
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0"
    }
    # Target blog URL.
    url = 'https://www.cnblogs.com/' + user_name + '/?page=1'
    # Fetch the HTML.
    response = requests.get(url, headers=headers)
    # Parse the HTML into a BeautifulSoup object.
    soup = bs(response.text, 'html.parser')
    page_num = 1
    # Use select() to pick every page-number element in the pager.
    page_lists = soup.select('div.pager a')
    for a_element in page_lists:
        numbers = re.findall(r'\d+', a_element.text)
        if numbers:
            number = int(numbers[0])  # Convert the extracted digit string to an integer.
            if page_num < number:
                page_num = number
    paper_list = []
    articles = soup.select('div.day div.postTitle a')
    for article in articles:
        href = article['href']
        # print(href)
        paper_list.append(href)
    if page_num > 1:
        random_wait_time = random.randint(1, 2)
        time.sleep(random_wait_time)
        for i in range(2, page_num + 1):
            url = 'https://www.cnblogs.com/' + user_name + '/?page=' + str(i)
            response = requests.get(url, headers=headers)
            # print(response.status_code)
            soup = bs(response.text, 'html.parser')
            articles = soup.select('div.day div.postTitle a')
            for article in articles:
                href = article['href']
                # print(href)
                paper_list.append(href)
    return paper_list
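
# A minimal sketch of the pager detection used above, on an assumed HTML shape:
# pager = bs('<div class="pager"><a>1</a><a>2</a><a>Next</a></div>', 'html.parser')
# nums = [int(re.findall(r'\d+', a.text)[0])
#         for a in pager.select('div.pager a') if re.findall(r'\d+', a.text)]
# max(nums, default=1)  # -> 2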

def csdn_list(user_name):
    headers = {
        "Accept": "application/json, text/plain, */*",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        "Connection": "keep-alive",
        "Cookie": "",
        "Host": "blog.csdn.net",
        "Referer": f"https://blog.csdn.net/{user_name}?type=blog",
        "Sec-Ch-Ua": '"Not A(Brand";v="99", "Microsoft Edge";v="121", "Chromium";v="121"',
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": '"Windows"',
        "Sec-Fetch-Dest": 'empty',
        "Sec-Fetch-Mode": 'cors',
        "Sec-Fetch-Site": 'same-origin',
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0"
    }
    # Ask the profile API for the total number of articles.
    url = 'https://blog.csdn.net/community/home-api/v1/get-tab-total?username=' + user_name
    response = requests.get(url, headers=headers)
    page_num = 0  # Stays zero (no pages fetched) if the count request fails.
    if response.status_code == 200:
        data = response.json()
        article_num = data['data']['blog']
        # print(article_num)
        page_num = article_num // 20  # Whole pages of 20 articles each.
        remainder = article_num % 20  # Articles left over.
        if remainder > 0:
            page_num += 1  # A partial page still needs one more fetch.
    paper_list = []
    for i in range(1, page_num + 1):
        url = ('https://blog.csdn.net/community/home-api/v1/get-business-list?page=' + str(i)
               + '&size=20&businessType=blog&orderby=&noMore=false&year=&month=&username=' + user_name)
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            data = response.json()
            for item in data['data']['list']:
                href = item['url']
                # print(href)
                paper_list.append(href)
    return paper_list


def zhihu_list(user_name):
    headers = {
        "Accept": "application/json, text/plain, */*",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        "Connection": "keep-alive",
        "Cookie": "",   # Left blank; fill in if the site requires them.
        "Host": "",
        "Referer": "",
        "Sec-Ch-Ua": '"Not A(Brand";v="99", "Microsoft Edge";v="121", "Chromium";v="121"',
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": '"Windows"',
        "Sec-Fetch-Dest": 'empty',
        "Sec-Fetch-Mode": 'cors',
        "Sec-Fetch-Site": 'same-origin',
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0"
    }
    # Target blog URL.
    url = 'https://www.zhihu.com/people/' + user_name + '/posts?page=1'
    # Fetch the HTML.
    response = requests.get(url, headers=headers)
    # Parse the HTML into a BeautifulSoup object.
    soup = bs(response.text, 'html.parser')
    page_num = 1
    # Use select() to pick every page-number element in the pagination bar.
    page_lists = soup.select('div.Pagination button')
    for a_element in page_lists:
        numbers = re.findall(r'\d+', a_element.text)
        if numbers:
            number = int(numbers[0])  # Convert the extracted digit string to an integer.
            if page_num < number:
                page_num = number
    paper_list = []
    articles = soup.select('h2.ContentItem-title span a')
    for article in articles:
        href = article['href']
        # print(href)
        paper_list.append(href)
    if page_num > 1:
        random_wait_time = random.randint(1, 2)
        time.sleep(random_wait_time)
        for i in range(2, page_num + 1):
            url = 'https://www.zhihu.com/people/' + user_name + '/posts?page=' + str(i)
            response = requests.get(url, headers=headers)
            # print(response.status_code)
            soup = bs(response.text, 'html.parser')
            articles = soup.select('h2.ContentItem-title span a')
            for article in articles:
                href = article['href']
                # print(href)
                paper_list.append(href)
    return paper_list


# cnblogs
# user_name = 'lingwang3'
# blog_name = 'cnblogs'
# cnblogs_list(user_name)

# csdn
# user_name = 'weixin_40340586'
# blog_name = 'csdn'
# csdn_list(user_name)

# zhihu ye-chi-4-96
# user_name = '--89-68-45'
# blog_name = 'zhihu'
# zhihu_list(user_name)


def convert(html):
    soup = bs(html, 'html.parser')
    # CSDN    - #content_views
    # cnblogs - #cnblogs_post_body
    # zhihu   - .Post-RichTextContainer
    container = soup.select_one('#content_views') \
        or soup.select_one('#cnblogs_post_body') \
        or soup.select_one('.Post-RichTextContainer') \
        or soup.select_one('article .post-container') \
        or soup.select_one('article #content') \
        or soup.select_one('.page-article .main-content .article-main .article-content') \
        or soup.select_one('article') \
        or soup.select_one('body') \
        or soup
    return __print_tree(container)
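
# A rough round trip through convert() (hand-checked against the mapping
# tables above; the fragment is an illustrative assumption):
# html = '<article><h2>Title</h2><p>Some <b>bold</b> text.</p></article>'
# convert(html)  # -> '\n## Title\n\nSome **bold** text.\n'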

def __print_tree(ele, intent=0, md=''):
    """Recursively walk the DOM; plain recursion keeps the implementation simple.

    Arguments:
        ele {bs} -- element to convert

    Keyword Arguments:
        intent {int} -- indentation level (default: {0})
        md {str} -- document converted so far (default: {''})

    Returns:
        str -- the converted document
    """
    if isinstance(ele, NavigableString):
        md = __transform_text(ele, md)
    elif ele.name == 'img':
        md = __transform_img(ele, md)
    elif ele.name == 'a':
        md = __transform_a(ele, md, intent)
    elif ele.name in inline_map['normal'].keys():
        md = __transform_inline_tags(ele, md, intent)
    elif ele.name == 'pre':
        md = __transform_pre(ele, md, intent)
    elif ele.name in ('ul', 'ol'):
        md = __transform_list_tags(ele, md, intent)
    elif ele.name in block_map['normal'].keys():
        md = __transform_block_normal_tags(ele, md, intent)
    elif ele.name in block_map['intent'].keys():
        md = __transform_block_intent_tags(ele, md, intent)
    elif ele.name == '[document]':
        md = __transform_soup(ele, md, intent)
    else:
        md = __transform_other_tags(ele, md, intent)
    return md


def __transform_text(ele, md):
    text = re.compile(r'[\s]+').sub(' ', ele.string)
    # Keep edge whitespace only when the neighbouring node is an inline tag.
    text = text if ele.previous_sibling and ele.previous_sibling.name in inline_tags else text.lstrip()
    text = text if ele.next_sibling and ele.next_sibling.name in inline_tags else text.rstrip()
    md += text
    return md


def __transform_img(ele, md):
    md += inline_map['link']['img'].format(ele.get('alt') or '', ele.get('src') or '')
    return md


def __transform_a(ele, md, intent):
    a_inner = ''
    for child in ele.children:
        a_inner = __print_tree(child, intent, a_inner)
    if a_inner != '':
        md += inline_map['link']['a'].format(a_inner, ele.get('href') or ele.get_text(strip=True))
    return md


def __transform_pre(ele, md, intent):
    # highlight.js marks the code element with 'hljs' plus a language class.
    lang_tag = ele.find(class_='hljs')
    if lang_tag:
        lang_tag['class'].remove('hljs')
    lang = ''.join(lang_tag['class']) if lang_tag else ''
    md += block_map['intent']['pre'].format(
        ' ' * intent, lang,
        ele.text.strip().replace('\n', '\n' + ' ' * intent),
        ' ' * intent)
    return md


def __transform_inline_tags(ele, md, intent):
    inline_tag_inner = ''
    for child in ele.children:
        inline_tag_inner = __print_tree(child, intent, inline_tag_inner)
    if inline_tag_inner:
        md += inline_map['normal'][ele.name].format(inline_tag_inner)
    return md


def __transform_block_normal_tags(ele, md, intent):
    block_tag_inner = ''
    for child in ele.children:
        block_tag_inner = __print_tree(child, intent, block_tag_inner)
    md += block_map['normal'][ele.name].format(block_tag_inner)
    return md


def __transform_block_intent_tags(ele, md, intent):
    block_tag_inner = ''
    tpl = block_map['intent'][ele.name]
    prev = ' ' * intent
    if ele.parent.name == 'blockquote':
        # Nested inside a blockquote: inherit and extend the parent's '> ' prefix.
        prev = ele.parent['data-prev']
        ele['data-prev'] = ele.parent['data-prev'] + '> '
        tpl = ele.parent['data-prev'] + '\n' + tpl + '\n'
    elif ele.name == 'blockquote':
        tpl = '\n' + tpl + '\n'
        ele['data-prev'] = ' ' * intent + '> '
    for child in ele.children:
        block_tag_inner = __print_tree(child, intent, block_tag_inner)
    tpl = __fill_newline_if_need(ele, tpl)
    md += tpl.format(prev, block_tag_inner)
    return md


def __transform_other_tags(ele, md, intent):
    other_inner = ''
    for child in ele.children:
        other_inner = __print_tree(child, intent, other_inner)
    # Keep the unknown tag itself and splice the converted children back in.
    ele.clear()
    ele.append('{}')
    md += ele.decode().format(other_inner)
    return md
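
# A small sketch of the <pre> handling above, assuming highlight.js-style markup:
# html = '<pre><code class="language-python hljs">print(1)</code></pre>'
# convert(html)  # -> '\n```language-python\nprint(1)\n```\n'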

def __transform_list_tags(ele, md, intent):
    list_text = '\n'
    if ele.find_parent(re.compile('[ou]l')):
        intent += 4  # Nested lists get four extra spaces of indentation.
    line_head = '* ' if ele.name == 'ul' else '{}. '
    for i, e in enumerate(ele.find_all('li', recursive=False)):
        li_inner = ''
        for child in e.children:
            li_inner = __print_tree(child, intent, li_inner)
        list_text += ' ' * intent + line_head.format(i + 1) + li_inner.lstrip() + '\n'
    md += __fill_newline_if_need(ele, list_text) if list_text.strip() != '' else ''
    return md


def __transform_soup(ele, md, intent):
    for child in ele.children:
        md = __print_tree(child, intent, md)
    return md


def __fill_newline_if_need(ele, text):
    if ele.next_sibling and ele.next_sibling.name in inline_map['normal'].keys() \
            or isinstance(ele.next_sibling, NavigableString) and ele.next_sibling.string.strip() != '':
        text += '\n'
    if ele.previous_sibling and ele.previous_sibling.name in inline_map['normal'].keys() \
            or isinstance(ele.previous_sibling, NavigableString) and ele.previous_sibling.string.strip() != '':
        text = '\n' + text
    return text


#########################################################################################
# Extract HTML text from a "document rich" JSON document (recursive step).
def recursion_json_to_html(data):
    html_data = ""
    # Element node.
    if ("type" in data) and data["type"]:
        # Opening tag (currently disabled; only the text content is kept).
        if ("style" in data) and data["style"]:
            # html_data = html_data + "<" + data["type"] + " style=\"" + data["style"] + "\"" + ">"
            pass
        else:
            # html_data = html_data + "<" + data["type"] + ">"
            pass
    # Child elements.
    if "children" in data:
        for children_item in data["children"]:
            tmp_data = recursion_json_to_html(children_item)
            html_data = html_data + tmp_data
    # Closing tag (disabled as well).
    # html_data = html_data + ""
    # Text content.
    if ("text" in data) and data["text"]:
        html_data = html_data + data["text"]
    # Return the result.
    return html_data


# Extract HTML text from a "document rich" JSON document.
def json_to_html(json_str):
    html_data = ""
    data = json.loads(json_str)
    # Convert recursively.
    html_data = recursion_json_to_html(data)
    # Return the result.
    return html_data


# Extract plain text from the JSON document.
def json_to_text(json_str):
    # Extract HTML text from the "document rich" JSON document.
    html_content = json_to_html(json_str)
    # Extract plain text from the HTML.
    text_content = get_text_from_html(html_content)
    return text_content


# AI Rich Editor document to plain text.
def ai_rich_json_to_text(json_str):
    content = ""
    if json_str:
        data = json.loads(json_str)
        if "doc" == data.get("type", None):
            for item_i in data.get("content"):
                tmp_item_i_content = item_i.get("content", None)
                if tmp_item_i_content:
                    for item_j in tmp_item_i_content:
                        if "text" == item_j.get("type", None):
                            tmp_text = item_j.get("text", "")
                            content = content + tmp_text
    # Return the result.
    return content


# Excel --> Markdown.
def excel_to_markdown(file_path):
    print('====================excel_to_markdown====1==================')
    md = MarkItDown()
    result = md.convert(file_path)
    print(result.text_content)
    return result.text_content
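
# Usage sketches for the JSON/Excel helpers above (the sample documents and
# file name are assumptions, not real editor output):
# json_to_text('{"type": "div", "children": [{"type": "span", "text": "hello"}]}')  # -> 'hello'
# ai_rich_json_to_text('{"type": "doc", "content": '
#                      '[{"type": "paragraph", "content": [{"type": "text", "text": "hi"}]}]}')  # -> 'hi'
# excel_to_markdown('report.xlsx')  # -> the workbook rendered as Markdown by MarkItDown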

def get_hotnew_url(url):
    yaowen_url = []
    rebang_url = []
    jishi_url = []
    # Fetch the page contents.
    html = url_to_raw_html(url)
    html_other = url_to_raw_html(url + "/scroll-news/news1.html")
    # Parse the HTML into BeautifulSoup objects.
    soup = bs(html, 'html.parser')
    soup_other = bs(html_other, 'html.parser')
    # Use select() to pick the link elements of each section.
    yaowen_lists = soup.select('.news-left .ywjx-news-list ul li a')  # Top stories
    rebang_lists = soup.select('.news-left .rdph-list.rdph-list2 ul li a')  # China News hot list
    jishi_lists = soup_other.select('.content-left .content_list ul li .dd_bt a')  # Breaking news
    # Top stories
    for item in yaowen_lists:
        tmp_url = item["href"]
        if "://" in tmp_url:
            pass  # Already absolute.
        elif "//" in tmp_url:
            tmp_url = "https:" + tmp_url  # Protocol-relative.
        else:
            tmp_url = url + tmp_url  # Site-relative.
        # Collect it.
        yaowen_url.append(tmp_url)
    # China News hot list
    for item in rebang_lists:
        tmp_url = item["href"]
        if "://" in tmp_url:
            pass
        elif "//" in tmp_url:
            tmp_url = "https:" + tmp_url
        else:
            tmp_url = url + tmp_url
        # Collect it.
        rebang_url.append(tmp_url)
    # Breaking news
    for item in jishi_lists:
        tmp_url = item["href"]
        if "://" in tmp_url:
            pass
        elif "//" in tmp_url:
            tmp_url = "https:" + tmp_url
        else:
            tmp_url = url + tmp_url
        # Collect it.
        jishi_url.append(tmp_url)
    # print(yaowen_url)
    # print('----------------------------------------------------')
    # print(rebang_url)
    # print('----------------------------------------------------')
    # print(jishi_url)
    return (yaowen_url, rebang_url, jishi_url)


def get_hotnews_content(url):
    title = ""
    datetime = ""
    content = ""
    # Fetch the page contents.
    html = url_to_raw_html(url)
    # Parse the HTML into a BeautifulSoup object.
    soup = bs(html, 'html.parser')
    # Use select() to pick the title, date/time, and body elements.
    title_element = soup.select('.content .content_maincontent_more h1')  # Title
    datetime_element = soup.select('.content .content_maincontent_more .content_left_time')  # Date + time
    content_element = soup.select('.content .content_maincontent_more .content_maincontent_content .left_zw')  # Body
    # Extract the title.
    for item in title_element:
        title = item.text if item.text else ""
    # Extract the date/time (target format: 2025-04-08 02:55).
    for item in datetime_element:
        # item.text --> '2025年04月07日 23:55 来源:' (date, time, then the source label)
        datetime = item.contents[0] if (item.contents and (0 < len(item.contents))) else ""
    # Extract the body text (assumed completion: the original tail was cut off
    # here, so this mirrors the title loop above).
    for item in content_element:
        content = item.text if item.text else ""
    return (title, datetime, content)
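
# Usage sketch (the base URL is an assumption inferred from the
# '/scroll-news/news1.html' path and the section selectors above):
# yaowen, rebang, jishi = get_hotnew_url('https://www.chinanews.com.cn')
# if yaowen:
#     title, datetime, content = get_hotnews_content(yaowen[0])
#     print(title, datetime)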