当前位置: 首页 > news >正文

pdf、docx、markdown、txt提取文档内容,可以应用于rag文档解析

返回的是文档解析分段内容组成的列表,分段内容默认chunk_size: int = 250, chunk_overlap: int = 50,250字分段,50分段处保留后面一段的前50字拼接即窗口包含下下一段前面50个字划分

from typing import Union, Listimport jieba
import reclass SentenceSplitter:def __init__(self, chunk_size: int = 250, chunk_overlap: int = 50):self.chunk_size = chunk_sizeself.chunk_overlap = chunk_overlapdef split_text(self, text: str) -> List[str]:if self._is_has_chinese(text):return self._split_chinese_text(text)else:return self._split_english_text(text)def _split_chinese_text(self, text: str) -> List[str]:sentence_endings = {'\n', '。', '!', '?', ';', '…'}  # 句末标点符号chunks, current_chunk = [], ''for word in jieba.cut(text):if len(current_chunk) + len(word) > self.chunk_size:chunks.append(current_chunk.strip())current_chunk = wordelse:current_chunk += wordif word[-1] in sentence_endings and len(current_chunk) > self.chunk_size - self.chunk_overlap:chunks.append(current_chunk.strip())current_chunk = ''if current_chunk:chunks.append(current_chunk.strip())if self.chunk_overlap > 0 and len(chunks) > 1:chunks = self._handle_overlap(chunks)return chunksdef _split_english_text(self, text: str) -> List[str]:# 使用正则表达式按句子分割英文文本sentences = re.split(r'(?<=[.!?])\s+', text.replace('\n', ' '))chunks, current_chunk = [], ''for sentence in sentences:if len(current_chunk) + len(sentence) <= self.chunk_size or not current_chunk:current_chunk += (' ' if current_chunk else '') + sentenceelse:chunks.append(current_chunk)current_chunk = sentenceif current_chunk:  # Add the last chunkchunks.append(current_chunk)if self.chunk_overlap > 0 and len(chunks) > 1:chunks = self._handle_overlap(chunks)return chunksdef _is_has_chinese(self, text: str) -> bool:# check if contains chinese charactersif any("\u4e00" <= ch <= "\u9fff" for ch in text):return Trueelse:return Falsedef _handle_overlap(self, chunks: List[str]) -> List[str]:# 处理块间重叠overlapped_chunks = []for i in range(len(chunks) - 1):chunk = chunks[i] + ' ' + chunks[i + 1][:self.chunk_overlap]overlapped_chunks.append(chunk.strip())overlapped_chunks.append(chunks[-1])return overlapped_chunkstext_splitter = SentenceSplitter()def load_file(filepath):print("filepath:",filepath)if filepath.endswith(".md"):contents = extract_text_from_markdown(filepath)elif filepath.endswith(".pdf"):contents = extract_text_from_pdf(filepath)elif filepath.endswith('.docx'):contents = extract_text_from_docx(filepath)else:contents = extract_text_from_txt(filepath)return contentsdef extract_text_from_pdf(file_path: str):"""Extract text content from a PDF file."""import PyPDF2contents = []with open(file_path, 'rb') as f:pdf_reader = PyPDF2.PdfReader(f)for page in pdf_reader.pages:page_text = page.extract_text().strip()raw_text = [text.strip() for text in page_text.splitlines() if text.strip()]new_text = ''for text in raw_text:new_text += textif text[-1] in ['.', '!', '?', '。', '!', '?', '…', ';', ';', ':', ':', '”', '’', ')', '】', '》', '」','』', '〕', '〉', '》', '〗', '〞', '〟', '»', '"', "'", ')', ']', '}']:contents.append(new_text)new_text = ''if new_text:contents.append(new_text)return contentsdef extract_text_from_txt(file_path: str):"""Extract text content from a TXT file."""with open(file_path, 'r', encoding='utf-8') as f:contents = [text.strip() for text in f.readlines() if text.strip()]return contentsdef extract_text_from_docx(file_path: str):"""Extract text content from a DOCX file."""import docxdocument = docx.Document(file_path)contents = [paragraph.text.strip() for paragraph in document.paragraphs if paragraph.text.strip()]return contentsdef extract_text_from_markdown(file_path: str):"""Extract text content from a Markdown file."""import markdownfrom bs4 import BeautifulSoupwith open(file_path, 'r', encoding='utf-8') as f:markdown_text = f.read()html = markdown.markdown(markdown_text)soup = BeautifulSoup(html, 'html.parser')contents = [text.strip() for text in soup.get_text().splitlines() if text.strip()]return contentstexts = load_file(r"C:\Users\lo***山市城市建筑外立面管理条例.docx")
print(texts)

在这里插入图片描述

http://www.lryc.cn/news/333818.html

相关文章:

  • 【Linux系列】“dev-node1“ 运行的操作系统分析
  • SpriingBoot整合MongoDB多数据源
  • 深入浅出 -- 系统架构之负载均衡Nginx缓存机制
  • 前端 小程序框架UniApp
  • 宏集PLC如何为楼宇自动化行业提供空调、供暖与通风的解决方案?
  • 【TI毫米波雷达】官方工业雷达包的生命体征检测环境配置及避坑(Vital_Signs、IWR6843AOPEVM)
  • 计算机毕业设计选题之基于SSM的旅游管理系统【源码+PPT+文档+包运行成功+部署讲解】
  • JavaWeb入门——Web前端概述及HTML,CSS语言基本使用
  • 数据结构(3)----栈和队列
  • nestjs 全栈进阶--module
  • jupyter python paramiko 网络系统运维
  • Windows Edge浏览器兼容性问题诊断与修复策略详解
  • EXCEL学习笔记
  • 使用预训练的bert large model实现问答系统源码(本地实现 question answer system)
  • 蓝桥杯 历届真题 杨辉三角形【第十二届】【省赛】【C组】
  • 商务电子邮件: 在WorkPlace中高效且安全
  • 阿里云2024年优惠券领取及使用常见问题
  • 90天玩转Python—05—基础知识篇:Python基础知识扫盲,使用方法与注意事项
  • 常见的常见免费开源绘图工具对比 draw.io/Excalidraw/Lucidchart/yEd Graph Editor/Dia/
  • 项目:自主实现Boost搜索引擎
  • 麒麟系统ARM安装rabbitmq
  • MongoDB数据更新大之大与小中小
  • C语言开发实战:使用EasyX在Visual Studio 2022中创建井字棋游戏
  • Android与RN远程过程调用的原理
  • MySQL-主从复制:概述、原理、同步数据一致性问题、搭建流程
  • 论文阅读《Semantic Prompt for Few-Shot Image Recognition》
  • Linux初学(十七)docker
  • Python---Numpy线性代数
  • react+ echarts 轮播饼图
  • 政安晨:【深度学习神经网络基础】(三)—— 激活函数