当前位置：首页 > news > 正文

CSDN文章保存为MD文档（一）

news 2025/8/12 5:38:36

免责声明

本文仅作经验分享之用。因使用本文所提供的信息而造成的任何直接或间接的后果及损失，均由使用者本人负责，作者不为此承担任何责任；一旦造成后果，请自行承担！

import os
import re
import sys
import requests
sys.path.append("")
from os.path import join, exists
from urllib.request import urlretrieve
from bs4 import BeautifulSoup, Tag, NavigableString, Comment

class Parser(object):
    """Convert an HTML fragment into a list of Markdown text pieces.

    The constructor parses ``html`` with BeautifulSoup and immediately walks
    the tree with :meth:`recursive`.  The Markdown pieces are appended, in
    document order, to ``self.outputs``; callers join them with ``''.join``.
    Every ``<img>`` encountered is downloaded into ``markdown_dir``.

    Args:
        html: HTML source to convert.
        markdown_dir: Directory that receives downloaded images; it is
            created when missing.
    """

    # Substring fallbacks used when a <code> tag has no ``language-xxx`` class.
    KNOWN_LANGUAGES = ('cpp', 'bash', 'python', 'java')
    DEFAULT_LANGUAGE = 'bash'
    HEADING_TAGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')
    # Group 1: URL up to its last '?'; group 2: URL up to a png/jpeg/jpg suffix.
    _IMG_NAME_PATTERN = re.compile(r'(.*\..*\?)|(.*\.(png|jpeg|jpg))')

    def __init__(self, html, markdown_dir):
        self.html = html
        self.soup = BeautifulSoup(html, 'html.parser')
        self.outputs = []
        # An empty directory name cannot be created; treat it as the cwd.
        self.fig_dir = markdown_dir or '.'
        self.pre = False          # True while inside a <pre> awaiting its <code>
        self.equ_inline = False

        os.makedirs(self.fig_dir, exist_ok=True)
        self.recursive(self.soup)

    def remove_comment(self, soup):
        """Detach every HTML comment found below ``soup``."""
        if not hasattr(soup, 'children'):
            return
        # Iterate over a snapshot: extract() edits the live child list, and
        # mutating it mid-iteration would skip the following sibling.
        for child in list(soup.children):
            if isinstance(child, Comment):
                child.extract()
            else:
                self.remove_comment(child)

    @staticmethod
    def _wrap(node, prefix=None, suffix=None):
        """Put literal Markdown text before and/or after ``node``'s children."""
        if prefix is not None:
            node.contents.insert(0, NavigableString(prefix))
        if suffix is not None:
            node.contents.append(NavigableString(suffix))

    @classmethod
    def _code_language(cls, node):
        """Return the fence language for a ``<code>`` tag inside ``<pre>``.

        Prefers an explicit ``language-xxx`` class (e.g.
        ``<code class="prism language-cpp">``), then a substring match against
        ``KNOWN_LANGUAGES``, and finally ``DEFAULT_LANGUAGE`` so the result is
        always defined.
        """
        classes = node.attrs.get('class')
        if not classes:
            return cls.DEFAULT_LANGUAGE
        if isinstance(classes, str):
            classes = classes.split()
        marker = 'language-'
        for css_class in classes:
            if css_class.startswith(marker) and len(css_class) > len(marker):
                return css_class[len(marker):]
        joined = ' '.join(classes)
        language = cls.DEFAULT_LANGUAGE
        for name in cls.KNOWN_LANGUAGES:
            if name in joined:
                language = name
        return language

    @classmethod
    def _image_filename(cls, src):
        """Derive a local file name from the image URL ``src``."""
        match = cls._IMG_NAME_PATTERN.search(src)
        if match:
            candidate = match.group(1) or match.group(2)
        else:
            # URL matched neither alternative (e.g. ".gif" without a query):
            # fall back to the URL without its fragment/query part.
            candidate = src.split('#', 1)[0].split('?', 1)[0]
        name = candidate.split('/')[-1].rstrip('?')
        return name or 'image'

    def _emit_equation(self, node):
        """Emit the TeX source of a KaTeX ``<span>``; return True if emitted."""
        classes = node.attrs.get('class', [])
        inline = 'katex--inline' in classes
        if not inline and 'katex--display' not in classes:
            return False
        annotation = node.find('annotation', {'encoding': 'application/x-tex'})
        if annotation is None:
            # No TeX annotation available: let the caller process the
            # children like an ordinary span instead of raising.
            return False
        start, end = ('$', '$') if inline else ('\n\n$$', '$$\n\n')
        self.outputs.append(start + annotation.get_text() + end)
        return True

    def _emit_image(self, node):
        """Download the image of an ``<img>`` tag and emit a Markdown link."""
        src = node.attrs.get('src')
        if not src:
            return
        img_file = join(self.fig_dir, self._image_filename(src))
        target = img_file
        try:
            urlretrieve(src, img_file)
        except (OSError, ValueError):
            # Download failed (network/HTTP error, unsupported URL scheme):
            # point at the remote image rather than aborting the conversion.
            target = src
        code = '![{}]({})'.format(img_file, target)
        self.outputs.append('\n' + code + '\n')

    def recursive(self, soup):
        """Walk ``soup`` depth-first, appending Markdown to ``self.outputs``.

        Tags are translated by inserting Markdown markers around their
        children before descending; math spans and images are emitted
        directly and not descended into.
        """
        if isinstance(soup, Comment):
            return
        if isinstance(soup, NavigableString):
            # Work on a plain str copy instead of assigning attributes on the
            # node itself.
            text = str(soup)
            for key, val in special_characters.items():
                text = text.replace(key, val)
            self.outputs.append(text)
            return
        if isinstance(soup, Tag):
            tag = soup.name
            if tag in self.HEADING_TAGS:
                level = int(tag[1])
                self._wrap(soup, '\n' + '#' * level + ' ', '\n')
            elif tag == 'a' and 'href' in soup.attrs:
                self._wrap(soup, '[', "]({})".format(soup.attrs['href']))
            elif tag in ('b', 'strong'):
                self._wrap(soup, '**', '**')
            elif tag == 'em':
                self._wrap(soup, '*', '*')
            elif tag == 'pre':
                self.pre = True
            elif tag in ('code', 'tt'):
                if self.pre:
                    language = self._code_language(soup)
                    self._wrap(soup, '\n```{}\n'.format(language), '```\n')
                    self.pre = False  # assume the contents of <pre> contain only one <code>
                else:
                    self._wrap(soup, '`', '`')
            elif tag == 'p':
                parent = soup.parent
                if parent is None or parent.name != 'li':
                    self._wrap(soup, '\n')
            elif tag == 'span':
                if self._emit_equation(soup):
                    return
            elif tag in ('ol', 'ul'):
                self._wrap(soup, '\n', '\n')
            elif tag == 'li':
                self._wrap(soup, '+ ')
            elif tag == 'img':
                self._emit_image(soup)
                return
        if not hasattr(soup, 'children'):
            return
        for child in soup.children:
            self.recursive(child)
        if isinstance(soup, Tag) and soup.name == 'pre':
            # A <pre> without any <code> must not leave the flag set, or the
            # next inline <code> elsewhere would be rendered as a fenced block.
            self.pre = False

def html2md(url, md_file, with_title=False, fig_dir=None):
    """Fetch a CSDN article and write it to ``md_file`` as Markdown.

    Args:
        url: Address of the article page.
        md_file: Path of the Markdown file to write (UTF-8).
        with_title: When true, the ``article-title-box`` divs are converted
            in addition to the ``content_views`` body.
        fig_dir: Directory for downloaded images.  Defaults to the directory
            containing ``md_file`` so the function no longer depends on a
            ``markdown_dir`` global that only exists when the file is run as
            a script.

    Raises:
        requests.RequestException: On network failure, timeout or an HTTP
            error status.
    """
    response = requests.get(url, headers=header, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding="utf-8")
    for svg in soup.find_all('svg'):
        svg.extract()

    fragments = []
    if with_title:
        fragments.extend(
            str(box) for box in soup.find_all('div', {'class': 'article-title-box'}))
    fragments.extend(
        str(body) for body in soup.find_all('div', {'id': 'content_views'}))
    html = ''.join(fragments)

    if fig_dir is None:
        # dirname() is '' for a bare file name; fall back to the cwd.
        fig_dir = os.path.dirname(md_file) or '.'
    parser = Parser(html, fig_dir)
    with open(md_file, 'w', encoding="utf-8") as f:
        f.write('{}\n'.format(''.join(parser.outputs)))

def download_csdn_single_page(article_url, md_dir, with_title=True, pdf_dir='pdf', to_pdf=False):
    """Save one CSDN article as ``<md_dir>/<title>.md``.

    The file name is the first whitespace-separated word of the article's
    ``<h1 class="title-article">`` text, with characters that are illegal in
    file names removed; ``untitled`` is used when no usable title is found.

    Args:
        article_url: Address of the article page.
        md_dir: Directory that receives the Markdown file.
        with_title: Forwarded to :func:`html2md`.
        pdf_dir: Accepted for interface compatibility; not used here.
        to_pdf: Accepted for interface compatibility; not used here.

    Raises:
        requests.RequestException: On network failure, timeout or an HTTP
            error status.
    """
    response = requests.get(article_url, headers=header, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding="utf-8")

    # Use the article's HTML title as the Markdown file name.  get_text()
    # also works when the heading contains nested tags, where .string is None.
    heading = soup.find('h1', {'class': 'title-article'})
    raw_title = heading.get_text() if heading is not None else ''
    words = raw_title.replace('*', '').split()
    # Drop characters that are path separators or invalid in file names.
    name = re.sub(r'[\\/:*?"<>|]', '', words[0]) if words else ''
    if not name:
        name = 'untitled'

    md_file = os.path.join(md_dir, name + '.md')
    print('正在保存 Markdown File To {}'.format(md_file))
    html2md(article_url, md_file, with_title=with_title)

# HTTP request headers sent with every page fetch in this file.
header = {
    "Accept": "application/json, text/plain, */*",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,und;q=0.7",
    "Connection": "keep-alive",
    "Content-Type": "application/json;charset=UTF-8",
    # Browser-like User-Agent string.
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Safari/537.36",
    "x-requested-with": "XMLHttpRequest",
}

# Literal entity text that may survive HTML parsing (e.g. double-escaped
# source) mapped to the character it stands for.  Replacements are applied
# in insertion order, so each semicolon-terminated form is listed before its
# bare form; otherwise "&nbsp;" would be turned into " ;".
# NOTE(review): the "<" -> "<" and ">" -> ">" identity entries were no-ops,
# presumably "&lt;"/"&gt;" decoded by the page this code was copied from.
special_characters = {
    "&lt;": "<",
    "&gt;": ">",
    "&nbsp;": " ",
    "&nbsp": " ",
    "&#8203;": "",
    "&#8203": "",
}

if __name__ == '__main__':
    # Prompt for the CSDN blog URL; strip the whitespace/newline that often
    # comes along with a pasted link.
    article_url = input("输入需要保存的csdn博客链接：").strip()
    # Output folder.  Kept as a module-level name because other code in this
    # file may look it up as a global.
    markdown_dir = './'
    download_csdn_single_page(article_url, markdown_dir)