当前位置: 首页 > news >正文

Java实现word、pdf转html保留格式

一、word转html

依赖:

<properties><poi.version>5.2.3</poi.version><xhtml.version>2.0.4</xhtml.version>
</properties><!--word转html-->
<dependency><groupId>org.apache.poi</groupId><artifactId>poi-scratchpad</artifactId><version>${poi.version}</version>
</dependency>
<!--word转html-->
<dependency><groupId>fr.opensagres.xdocreport</groupId><artifactId>fr.opensagres.poi.xwpf.converter.xhtml</artifactId><version>${xhtml.version}</version>
</dependency>
<!--处理office文档表格相关 2007+版-->
<dependency><groupId>org.apache.poi</groupId><artifactId>poi-ooxml</artifactId><version>${poi.version}</version>
</dependency>
<!--处理office文档表格相关 2003版-->
<dependency><groupId>org.apache.poi</groupId><artifactId>poi</artifactId><version>${poi.version}</version>
</dependency>

代码:

import fr.opensagres.poi.xwpf.converter.xhtml.Base64EmbedImgManager;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.commons.codec.binary.Base64;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.xwpf.usermodel.XWPFDocument;import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
import java.net.URL;

/**
 * Converts Word documents to HTML strings.
 *
 * <p>{@code .docx} files are converted with the xdocreport XHTML converter and
 * {@code .doc} files with POI's {@code WordToHtmlConverter}. In both cases embedded
 * pictures are inlined into the HTML as base64 data.
 */
public class WordUtil {

    /**
     * Opens the document behind {@code fileUrl} and converts it to HTML.
     *
     * @param fileUrl    URL of the Word document to read
     * @param fileSuffix file extension including the leading dot; {@code ".docx"} and
     *                   {@code ".doc"} are accepted in any letter case
     * @return the generated HTML
     * @throws Exception        if the URL is malformed or the stream cannot be opened or closed
     * @throws RuntimeException if the suffix is not supported or the conversion fails; the
     *                          original failure is attached as the cause
     */
    public static String wordToHtml(String fileUrl, String fileSuffix) throws Exception {
        URL url = new URL(fileUrl);
        try (InputStream inputStream = url.openStream()) {
            // Constant-first equalsIgnoreCase accepts mixed case (".Docx") and is null-safe.
            if (".docx".equalsIgnoreCase(fileSuffix)) {
                return word2007ToHtml(inputStream);
            } else if (".doc".equalsIgnoreCase(fileSuffix)) {
                return word2003ToHtml(inputStream);
            } else {
                throw new RuntimeException("错误的文件后缀");
            }
        } catch (RuntimeException e) {
            // Same message as before, but keep the cause so the stack trace is not lost.
            throw new RuntimeException(e.getMessage(), e);
        }
    }

    /**
     * Converts a Word 2007+ ({@code .docx}) document to an HTML fragment.
     *
     * @param inputStream stream positioned at the start of the {@code .docx} content;
     *                    it is not closed by this method
     * @return the generated HTML
     * @throws RuntimeException if reading or converting the document fails; the original
     *                          failure is attached as the cause
     */
    public static String word2007ToHtml(InputStream inputStream) {
        try (ByteArrayOutputStream htmlStream = new ByteArrayOutputStream();
             XWPFDocument docxDocument = new XWPFDocument(inputStream)) {
            XHTMLOptions options = XHTMLOptions.create();
            // Keep style definitions even when no element uses them.
            options.setIgnoreStylesIfUnused(false);
            // Fragment mode: emit a <div>-wrapped fragment instead of a full page.
            options.setFragment(true);
            // Inline images as base64 instead of writing them to disk.
            options.setImageManager(new Base64EmbedImgManager());
            // Run the conversion into the in-memory buffer.
            XHTMLConverter.getInstance().convert(docxDocument, htmlStream, options);
            // NOTE(review): decodes with the platform default charset; confirm this matches
            // the encoding the converter writes before pinning an explicit charset here.
            return htmlStream.toString();
        } catch (Exception e) {
            throw new RuntimeException(e.getMessage(), e);
        }
    }

    /**
     * Converts a Word 97-2003 ({@code .doc}) document to an HTML page.
     *
     * @param inputStream stream positioned at the start of the {@code .doc} content;
     *                    it is not closed by this method
     * @return the generated HTML
     * @throws Exception        declared for backward compatibility with existing callers
     * @throws RuntimeException if reading or converting the document fails; the original
     *                          failure is attached as the cause
     */
    public static String word2003ToHtml(InputStream inputStream) throws Exception {
        try (StringWriter writer = new StringWriter();
             HWPFDocument document = new HWPFDocument(inputStream)) {
            WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                    DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
            // Inline every picture as a data URI labelled with the picture's own MIME type.
            wordToHtmlConverter.setPicturesManager(
                    (content, pictureType, suggestedName, widthInches, heightInches) ->
                            toDataUri(pictureType == null ? null : pictureType.getMime(), content));
            wordToHtmlConverter.processDocument(document);

            org.w3c.dom.Document htmlDocument = wordToHtmlConverter.getDocument();
            DOMSource domSource = new DOMSource(htmlDocument);

            Transformer serializer = TransformerFactory.newInstance().newTransformer();
            serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.setOutputProperty(OutputKeys.METHOD, "html");
            serializer.transform(domSource, new StreamResult(writer));
            return writer.toString();
        } catch (Exception e) {
            throw new RuntimeException(e.getMessage(), e);
        }
    }

    /**
     * Builds a base64 data URI for an embedded picture.
     *
     * @param mime    MIME type reported for the picture; falls back to {@code image/png}
     *                (the value that was previously hard-coded) when missing or unknown
     * @param content raw picture bytes
     * @return a {@code data:} URI carrying the picture
     */
    private static String toDataUri(String mime, byte[] content) {
        boolean usable = mime != null && !mime.isEmpty() && !"image/unknown".equals(mime);
        String effectiveMime = usable ? mime : "image/png";
        return "data:" + effectiveMime + ";base64," + Base64.encodeBase64String(content);
    }
}

来源博客:Java实现word转html_java word转html-CSDN博客

二、pdf转html

依赖:

        <dependency><groupId>net.sf.cssbox</groupId><artifactId>pdf2dom</artifactId></dependency><dependency><groupId>net.mabboud.fontverter</groupId><artifactId>FontVerter</artifactId></dependency><dependency><groupId>org.reflections</groupId><artifactId>reflections</artifactId></dependency><!--pdf转文本--><dependency><groupId>org.apache.pdfbox</groupId><artifactId>pdfbox</artifactId></dependency>

 代码:

import org.apache.pdfbox.pdmodel.PDDocument;
import org.fit.pdfdom.PDFDomTree;import java.io.*;
import java.net.URL;public class PDFUtil {public static String pdfToHtml(String fileUrl) throws IOException {URL url = new URL(fileUrl);try (InputStream inputStream = url.openStream()){return pdfToHtml(inputStream);}catch (Exception e){throw new IOException(e.getMessage());}}public static String pdfToHtml(InputStream inputStream) throws IOException {String outFilePath = "mypdf.html";String pdfContent = "";PDDocument document = PDDocument.load(inputStream);Writer writer = new PrintWriter(outFilePath, "UTF-8");new PDFDomTree().writeText(document, writer);writer.close();document.close();// 获取html内容try (BufferedReader reader = new BufferedReader(new FileReader(outFilePath))) {StringBuilder htmlContent = new StringBuilder();String line;while ((line = reader.readLine()) != null) {htmlContent.append(line).append("\n"); // 追加每一行内容,并添加换行符}pdfContent = String.valueOf(htmlContent);return pdfContent;} catch (IOException e) {e.printStackTrace();System.err.println("读取 HTML 文件时出错。");}return null;}
}

 来源博客:使用Java实现PDF到HTML的转换_java pdf转html-CSDN博客

http://www.lryc.cn/news/588675.html

相关文章:

  • JavaScript与Vue:现代前端开发的完美组合
  • Spark Expression codegen
  • Swift实现股票图:从基础到高级
  • 线程(一) linux
  • 使用Dify+fastmcp 实现mcp服务,内含详细步骤与源码
  • Mac IDEA启动报错:Error occurred during initialization of VM
  • Twisted study notes[1]
  • [附源码+数据库+毕业论文+开题报告]基于Spring+MyBatis+MySQL+Maven+jsp实现的车辆运输管理系统,推荐!
  • etcd自动压缩清理
  • easy-ui中的相对路径和绝对路径问题
  • 现代CSS实战:用变量与嵌套重构可维护的前端样式
  • 【GPIO】从STM32F103入门GPIO寄存器
  • 腿姐政治笔记唯物辩证法(2)(12356)
  • 面试遇到的问题
  • 使用JS编写用户信息采集表单
  • 利用android studio,对图片资源进行二次压缩
  • 网络编程-epoll模型/udp通信
  • Node.js 中http 和 http/2 是两个不同模块对比
  • AutoGPT vs BabyAGI:自主任务执行框架对比与选型深度分析
  • python的形成性考核管理系统
  • 1.easypan-登录注册
  • P3842 [TJOI2007] 线段
  • 基于多智能体强化学习的医疗检索增强生成系统研究—MMOA-RAG架构设计与实现
  • 编程技能:多文件编译
  • c++图形题练习程序
  • LVS三种模式实战
  • 图机器学习(6)——图自编码器
  • 【电脑】显卡(GPU)的基础知识
  • 【轨物方案】当补贴退潮,光伏电站如何回归价值本质?
  • MySQL数据库----函数