当前位置：首页 > news >正文

文档：mht 格式转 txt

news 2025/8/27 10:51:06

꧂ 两个地方都保存꧁

import codecs
import os

from bs4 import BeautifulSoup
def generate_output_filename(file_path, save_path):
    """Return the ``.txt`` output path inside *save_path* for *file_path*.

    Only the base name of *file_path* is kept; its last extension is
    replaced with ``.txt`` and the result is joined onto *save_path*.
    """
    file_name = os.path.splitext(os.path.basename(file_path))[0]
    return os.path.join(save_path, file_name + '.txt')


def get_content_from_mht(soup):
    """Return the text found under the document's ``<body>`` tag.

    This is only a sample extraction: adapt it to the actual structure of
    the MHT files being processed. Returns ``""`` when there is no body.
    """
    body = soup.body
    if body:
        return body.get_text()
    return ""


def _write_text_copies(source_path, save_paths):
    """Parse *source_path* and write its body text to a ``.txt`` under each save path."""
    # Read and parse before opening any output, so a failed read or decode
    # does not leave empty ``.txt`` files behind.
    with open(source_path, 'r', encoding='utf-8') as f_in:
        soup = BeautifulSoup(f_in, 'html.parser')
    content = get_content_from_mht(soup)

    # Identical destinations collapse to a single write instead of opening
    # the same file for writing twice; first-seen order is preserved.
    targets = dict.fromkeys(
        generate_output_filename(source_path, save_path)
        for save_path in save_paths
    )
    for target in targets:
        target_dir = os.path.dirname(target)
        if target_dir:
            # Create the destination directory on demand.
            os.makedirs(target_dir, exist_ok=True)
        # newline='' writes the text unchanged (no platform newline translation).
        with open(target, 'w', encoding='utf-8', newline='') as f_out:
            f_out.write(content)


def convert_mht_to_txt(path, save_path_1, save_path_2):
    """Convert one ``.mht`` file, or every ``.mht`` file under a directory, to text.

    Args:
        path: A ``.mht`` file, or a directory that is walked recursively.
            The extension is matched case-insensitively. Any other path
            (missing, or a file with another extension) is ignored.
        save_path_1: Directory receiving the first copy of each ``.txt``.
        save_path_2: Directory receiving the second copy. When both
            directories resolve to the same output file it is written once.

    Raises:
        UnicodeDecodeError: If an input file is not valid UTF-8.
        OSError: If an input cannot be read or an output cannot be written.

    Output names are derived from the input base name only, so inputs with
    the same name in different sub-directories write to the same ``.txt``.

    NOTE(review): each file is parsed directly as UTF-8 HTML; any MIME
    transfer encoding inside the archive is not decoded beforehand.
    """
    save_paths = (save_path_1, save_path_2)
    if os.path.isdir(path):
        for root, _dirs, files in os.walk(path):
            for file in files:
                if file.lower().endswith('.mht'):
                    _write_text_copies(os.path.join(root, file), save_paths)
    elif os.path.isfile(path) and path.lower().endswith('.mht'):
        _write_text_copies(path, save_paths)


# Example usage
path = input("请输入要处理的文件路径：")
save_path_1 = input("请输入第一个保存文件的位置（按回车默认保存）：")
save_path_2 = input("请输入第二个保存文件的位置（按回车默认不保存）：")

# Pressing Enter at the first prompt selects the built-in default directory.
if save_path_1 == '':
    save_path_1 = '/storage/emulated/0/文件/代码print相关/结构图/字符图/'

# With no second location given, the first one is passed for both targets.
if save_path_2 == '':
    convert_mht_to_txt(path, save_path_1, save_path_1)
else:
    convert_mht_to_txt(path, save_path_1, save_path_2)

仅适用于处理 .mht 格式的文件。

꧂ 添加：将转义字符转换成文字 ꧁

如果您想要将转义字符转换为相应的文字形式，可以借助 Python 的内置模块 html。以下是修改后的代码：

import os
import codecs
from bs4 import BeautifulSoup
import html


def generate_output_filename(file_path, save_path):
    """Return the ``.txt`` output path inside *save_path* for *file_path*.

    Only the base name of *file_path* is kept; its last extension is
    replaced with ``.txt`` and the result is joined onto *save_path*.
    """
    file_name = os.path.splitext(os.path.basename(file_path))[0]
    return os.path.join(save_path, file_name + '.txt')


def get_content_from_mht(soup):
    """Return the text under the document's ``<body>`` tag, with entities decoded.

    This is only a sample extraction: adapt it to the actual structure of
    the MHT files being processed. Returns ``""`` when there is no body.
    """
    body = soup.body
    if body:
        # Turn any remaining character references (e.g. ``&amp;``) into
        # the characters they stand for.
        return html.unescape(body.get_text())
    return ""


def _write_text_copies(source_path, save_paths):
    """Parse *source_path* and write its body text to a ``.txt`` under each save path."""
    # Read and parse before opening any output, so a failed read or decode
    # does not leave empty ``.txt`` files behind.
    with open(source_path, 'r', encoding='utf-8') as f_in:
        soup = BeautifulSoup(f_in, 'html.parser')
    content = get_content_from_mht(soup)

    # Identical destinations collapse to a single write instead of opening
    # the same file for writing twice; first-seen order is preserved.
    targets = dict.fromkeys(
        generate_output_filename(source_path, save_path)
        for save_path in save_paths
    )
    for target in targets:
        target_dir = os.path.dirname(target)
        if target_dir:
            # Create the destination directory on demand.
            os.makedirs(target_dir, exist_ok=True)
        # newline='' writes the text unchanged (no platform newline translation).
        with open(target, 'w', encoding='utf-8', newline='') as f_out:
            f_out.write(content)


def convert_mht_to_txt(path, save_path_1, save_path_2):
    """Convert one ``.mht`` file, or every ``.mht`` file under a directory, to text.

    Args:
        path: A ``.mht`` file, or a directory that is walked recursively.
            The extension is matched case-insensitively. Any other path
            (missing, or a file with another extension) is ignored.
        save_path_1: Directory receiving the first copy of each ``.txt``.
        save_path_2: Directory receiving the second copy. When both
            directories resolve to the same output file it is written once.

    Raises:
        UnicodeDecodeError: If an input file is not valid UTF-8.
        OSError: If an input cannot be read or an output cannot be written.

    Output names are derived from the input base name only, so inputs with
    the same name in different sub-directories write to the same ``.txt``.

    NOTE(review): each file is parsed directly as UTF-8 HTML; any MIME
    transfer encoding inside the archive is not decoded beforehand.
    """
    save_paths = (save_path_1, save_path_2)
    if os.path.isdir(path):
        for root, _dirs, files in os.walk(path):
            for file in files:
                if file.lower().endswith('.mht'):
                    _write_text_copies(os.path.join(root, file), save_paths)
    elif os.path.isfile(path) and path.lower().endswith('.mht'):
        _write_text_copies(path, save_paths)


# Example usage
path = input("请输入要处理的文件路径：")
save_path_1 = input("请输入第一个保存文件的位置（按回车默认保存）：")
save_path_2 = input("请输入第二个保存文件的位置（按回车默认不保存）：")

# Pressing Enter at the first prompt selects the built-in default directory.
if save_path_1 == '':
    save_path_1 = '/storage/emulated/0/文件/代码print相关/结构图/字符图/'

# With no second location given, the first one is passed for both targets.
if save_path_2 == '':
    convert_mht_to_txt(path, save_path_1, save_path_1)
else:
    convert_mht_to_txt(path, save_path_1, save_path_2)