当前位置: 首页 > news >正文

pdf 转html 在线预览和查询

方案一:
pdf2htmlex
package com.realize.controller;import cn.hutool.http.HttpUtil;
import com.alibaba.fastjson2.JSONObject;
import com.realize.util.MsgUtil;
import com.realize.util.OssUtil;
import com.realize.util.PdfConvertUtil;
import com.realize.util.StreamGobbler;
import lombok.extern.slf4j.Slf4j;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.ModelAttribute;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RestController;import java.io.*;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;@RestController
@Slf4j
public class ParserController {@GetMapping("/test")public String test() {return "test";}//    @PostMapping("/parseHtml")
//    public JSONObject parseHtml(@ModelAttribute("htmlUrl") String htmlUrl) {
//        try (Playwright playwright = Playwright.create()) {
//            Browser browser = playwright.chromium().launch(new BrowserType.LaunchOptions().setHeadless(true));
//            Page page = browser.newPage();
//            String filePath = "/mnt/temp/html/" + RandomUtil.randomString(10) + ".html";String filePath = "/Users/sunyechen/IdeaProjects/realize-nacos/bin/" + RandomUtil.randomString(10) + ".html";
//            HttpUtil.downloadFile(htmlUrl, filePath);
//            page.navigate("file:" + filePath);
//            page.evaluate("var imgList=document.getElementsByTagName('img');" +
//                    "for(var i=0;i<imgList.length;i++){" +
//                    "var src=imgList[i].getAttribute('src');" +
//                    "imgList[i].setAttribute('src','https://realizedongmi.oss-cn-shanghai.aliyuncs.com/a-filings/test/'+src);" +
//                    "}");
//            JSONObject result = new JSONObject();
//            result.put("html", page.innerHTML("css=body"));
//            result.put("css", page.innerHTML("css=style"));
//            result.put("txt", page.innerText("css=body").trim().replaceAll("\n", ""));
//            page.close();
//            browser.close();
//            return result;
//        } catch (Exception e) {
//            e.printStackTrace();
//        }
//        return null;
//    }@GetMapping("/batchConvertPdf")public String batchConvertPdf() {String folderName = "/root/pdf";File folder = new File(folderName);File[] files = folder.listFiles();for (int i = 0; i < files.length; i++) {String fileName = files[i].getName();if (fileName.toLowerCase().endsWith(".pdf")) {String exec = " docker run -i --rm -v /root/pdf:/pdf -w /pdf docker.io/pdf2htmlex/pdf2htmlex:0.18.8.rc2-master-20200820-ubuntu-20.04-x86_64 --font-size-multiplier 1 --zoom 1.3 " + fileName;try {
//                    Process process = new ProcessBuilder("/bin/sh", "-c", exec).start();
//                    String result = IOUtils.toString(process.getInputStream(), "utf-8");
//                    log.info("Executing Command [result]:{}", result);
//                    Runtime rt = Runtime.getRuntime();log.info("exec:{}", exec);
//                    String[] execArray = new String[]{"/bin/sh", "-c", " docker run -i --rm -v /root/pdf:/pdf -w /pdf docker.io/pdf2htmlex/pdf2htmlex:0.18.8.rc2-master-20200820-ubuntu-20.04-x86_64 --font-size-multiplier 1 --zoom 1.3 ", fileName};Process process = Runtime.getRuntime().exec(exec);StreamGobbler errorGobbler = new StreamGobbler(process.getErrorStream(), "ERROR");// 开启屏幕标准错误流errorGobbler.start();StreamGobbler outGobbler = new StreamGobbler(process.getInputStream(), "OUTPUT");// 开启屏幕标准输出流outGobbler.start();int w = process.waitFor();int v = process.exitValue();if (w == 0 && v == 0) {log.info("转换成功:{}", fileName);} else {log.info("转换失败:{}", fileName);}} catch (Exception e) {log.error("{}", e);return null;}}}return "ok";}@PostMapping("/convertAndParsePdf")public JSONObject convertAndParsePdf(@ModelAttribute("pdfUrl") String pdfUrl, @ModelAttribute("ossKey") String ossKey) throws Exception {log.info("接收到转换请求{},{}", pdfUrl, ossKey);String fileFolder = "/mnt_real/pdf/";JSONObject result = new JSONObject();String tempFileName = ossKey.substring(ossKey.lastIndexOf("/") + 1, ossKey.lastIndexOf("."));String pdfFilePath = fileFolder + tempFileName + ".pdf";String htmlFilePath = fileFolder + tempFileName + ".html";HttpUtil.downloadFile(pdfUrl, pdfFilePath);log.info("pdf文件下载成功{},{}", pdfUrl, ossKey);//解析pdf正文
//        String pdfText = PdfBoxUtil.getPdfText(pdfFilePath);
//        result.put("pdfText", pdfText);
//        log.info("pdfbox正文解析成功{},{}", pdfUrl, ossKey);//解析Boolean convertResult = PdfConvertUtil.convertPdf(tempFileName + ".pdf");if (convertResult) {try {List<String> allLines = Files.readAllLines(Paths.get(htmlFilePath), Charset.forName("UTF-8"));String content = String.join("\n", allLines);
//                File file = new File(htmlFilePath);
//                BufferedReader reader = new BufferedReader(new FileReader(file));
//                String line = "", oldContent = "";
//                while ((line = reader.readLine()) != null) {
//                    oldContent += line + "\n";
//                }
//                reader.close();content = content.replaceAll("github", "zzz").replaceAll("pdf2htmlEX", "tg").replaceAll("<meta charset=\"utf-8\"/>", "<meta charset=\"utf-8\"/><script src=\"https://oss.imvib.com/a-filings/test/test/search.js\" type=\"text/javascript\" charset=\"utf-8\"></script> ");File file = new File(htmlFilePath);file.delete();FileWriter writer = new FileWriter(htmlFilePath);writer.write(content);writer.close();log.info("html文件处理完成{},{}", pdfUrl, ossKey);result.put("code", 0);} catch (IOException e) {e.printStackTrace();result.put("code", -1);} finally {//上传所有文件String ossPath = ossKey.substring(0, ossKey.lastIndexOf("/") + 1);OssUtil.batchFileUploadOssUrl(fileFolder, ossPath);log.info("文件上传成功,完整链接:https://oss.imvib.com/{}", ossKey.replace(".html", ".pdf"));}} else {MsgUtil.sendDingTalkMsg(pdfUrl);result.put("code", -1);}return result;}private static byte[] readAllBytes(File file) throws IOException {try (FileInputStream fileInputStream = new FileInputStream(file)) {byte[] buffer = new byte[(int) file.length()];fileInputStream.read(buffer);return buffer;}}public static void main(String[] args) throws Exception {
//        String htmlFilePath = "/Users/sunyechen/doc/test/矩阵股份:长江证券承销保荐有限公司关于矩阵纵横设计股份有限公司使用募集资金置换预先投入募投项目及已支付发行费用的自筹资金的核查意见.html";
//        File htmlFile = new File(htmlFilePath);
//        Document html = Jsoup.parse(htmlFile);Element script = html.select("script").first();String sourceScript = script.html();script.html(sourceScript + PdfConvertUtil.addScript);
//        FileOutputStream fos = new FileOutputStream(htmlFilePath.replace(".html", "_.html"), false);
//        OutputStreamWriter osw = new OutputStreamWriter(fos, "UTF-8");
//        osw.write(html.outerHtml());
//        osw.close();
//        try (Playwright playwright = Playwright.create()) {
//            Browser browser = playwright.chromium().launch(new BrowserType.LaunchOptions().setHeadless(true));
//            BrowserContext context = browser.newContext(new Browser.NewContextOptions());
//            Page page = browser.newPage();
//            String htmlUrl = "https://realizedongmi.oss-cn-shanghai.aliyuncs.com/a-filings/test/2023-01-03%201%20%E5%8F%91%E8%A1%8C%E4%BA%BA%E5%8F%8A%E4%BF%9D%E8%8D%90%E6%9C%BA%E6%9E%84%E5%85%B3%E4%BA%8E%E4%BA%8C%E8%BD%AE%E5%AE%A1%E6%A0%B8%E9%97%AE%E8%AF%A2%E5%87%BD%E7%9A%84%E5%9B%9E%E5%A4%8D%EF%BC%88%E4%BF%AE%E8%AE%A2%E7%A8%BF%EF%BC%89_%E6%98%93%E7%91%9E%E7%94%9F%E7%89%A9.htm";
//            HttpUtil.downloadFile(htmlUrl, "/Users/sunyechen/IdeaProjects/realize-nacos/bin/1.html");
//            page.navigate("file:/Users/sunyechen/IdeaProjects/realize-nacos/bin/1.html");
//            System.out.println(page.innerHTML("body"));
//        }
//        System.out.println(URLDecoder.decode("https://oss.imvib.com/a-filings%252Foriginal%252F000586%252F2023-04-07+%25E5%25B9%25B4%25E5%25BA%25A6%25E5%2585%25B3%25E8%2581%2594%25E6%2596%25B9%25E8%25B5%2584%25E9%2587%2591%25E5%258D%25A0%25E7%2594%25A8%25E4%25B8%2593%25E9%25A1%25B9%25E5%25AE%25A1%25E8%25AE%25A1%25E6%258A%25A5%25E5%2591%258A.PDF", "UTF-8"));
//        try {
//            File file = new File("/Users/sunyechen/doc/test/矩阵股份:长江证券承销保荐有限公司关于矩阵纵横设计股份有限公司使用募集资金置换预先投入募投项目及已支付发行费用的自筹资金的核查意见.html");
//            BufferedReader reader = new BufferedReader(new FileReader(file));
//            String line = "", oldContent = "";
//            while ((line = reader.readLine()) != null) {
//                oldContent += line + "\n";
//            }
//            reader.close();
//            String newContent = oldContent.replaceAll("<meta charset=\"utf-8\"/>", "<meta charset=\"utf-8\"/><script src=\"https://oss.imvib.com/a-filings/test/test/search.js\" type=\"text/javascript\" charset=\"utf-8\"></script> ");
//            FileWriter writer = new FileWriter(new File("/Users/sunyechen/doc/test/矩阵股份:长江证券承销保荐有限公司关于矩阵纵横设计股份有限公司使用募集资金置换预先投入募投项目及已支付发行费用的自筹资金的核查意见_1.html"));
//            writer.write(newContent);
//            writer.close();
//            System.out.println("File updated successfully.");
//        } catch (IOException e) {
//            e.printStackTrace();
//        }
//        log.info("start");
//        String htmlFilePath = "/Users/sunyechen/sfit/7745c98a5ba34525937bce19519c0b1e.html";
//        try {
//            File file = new File(htmlFilePath);
//            BufferedReader reader = new BufferedReader(new FileReader(file));
//            String line = "", oldContent = "";
//            while ((line = reader.readLine()) != null) {
//                oldContent += line + "\n";
//            }
//            reader.close();
//            String newContent = oldContent.replaceAll("github", "zzz").replaceAll("pdf2htmlEX", "tanqiuhuashigou").replaceAll("<meta charset=\"utf-8\"/>", "<meta charset=\"utf-8\"/><script src=\"https://oss.imvib.com/a-filings/test/test/search.js\" type=\"text/javascript\" charset=\"utf-8\"></script> ");
//            String newHtmlFilePath = htmlFilePath.replace(".html", "_.html");
//            FileWriter writer = new FileWriter(newHtmlFilePath);
//            writer.write(newContent);
//            writer.close();
//            log.info("html文件处理完成{},{}");
//
//        } catch (IOException e) {
//            e.printStackTrace();
//        }log.info("start");
//        String ossKey = "ann/688249/2023/4/688249_20230412_9XYK/688249_20230412_9XYK.html";
//        String ossPath = ossKey.substring(0, ossKey.lastIndexOf("/") + 1);
//        OssUtil.batchFileUploadOssUrl("/Users/sunyechen/sfit/test/", ossPath);
//        File[] fileList = new File("/Users/sunyechen/sfit/test/").listFiles();
//        for (int i = 0; i < fileList.length; i++) {
//            OssUtil.fileUploadOssUrl(fileList[i], ossPath + fileList[i].getName());
//            fileList[i].delete();
//        }String htmlFilePath = "/Users/sunyechen/sfit/test/600499_20230415_EVH7.html";List<String> allLines = Files.readAllLines(Paths.get(htmlFilePath), Charset.forName("UTF-8"));String content = String.join("\n", allLines);System.out.println(content);log.info("start");File file = new File(htmlFilePath);BufferedReader reader = new BufferedReader(new FileReader(file));String line = "", oldContent = "";while ((line = reader.readLine()) != null) {oldContent += line + "\n";}reader.close();System.out.println(oldContent);log.info("end");}}

方案二:

kkFileView-4.0.0

kkFileView - 在线文件预览

方案三:

wkhtmltox-0.12.6-1.centos7.x86_64.rpm

wkhtmltopdf 

http://www.lryc.cn/news/287408.html

相关文章:

  • docker 体验怀旧游戏(魂斗罗等)
  • JS中判断数据类型总结以及方法封装
  • 【Midjourney】绘画风格关键词
  • 教你如何低成本自建「幻兽帕鲁」服务器,快速一键部署
  • 拥抱社交电商浪潮:微信小程序商城崛起引领电商新风向-亿发
  • 一个使用pyqt的word文档查重工具
  • SpringCloud Alibaba Sentinel 与 SpringCloud Gateway 的限流有什么差别?(三种限流算法原理分析)
  • 邦芒忠告:职场新人最需要避开的十大雷坑
  • MySQL-进阶-索引
  • GitLab入门指南:上传与下载操作一网打尽
  • GPT应用_PrivateGPT
  • Qt‘s 撤销框架(Qt‘s Undo Framework)
  • 【C++】stack、queue的使用及模拟实现
  • 外包干了2个多月,技术退步明显。。。。。
  • html5实现好看的年会邀请函源码模板
  • 【C++】反向迭代器模拟实现
  • 【低照度图像增强系列(5)】Zero-DCE算法详解与代码实现(CVPR 2020)
  • 三维重建衡量指标记录
  • 在WinForms中控制模态对话框的关闭行为
  • java web mvc-02-struts2
  • 文件上传之大文件分块上传
  • 测试用例评审流程
  • 鸿蒙开发案列一
  • Vue实现图片预览,侧边栏懒加载,不用任何插件,简单好用
  • Spring依赖注入之setter注入与构造器注入以及applicationContext.xml配置文件特殊值处理
  • 碳排放预测 | Matlab实现LSTM多输入单输出未来碳排放预测,预测新数据
  • 手拉手JavaFX UI控件与springboot3+FX桌面开发
  • 02 分解质因子
  • 科技赋能智慧水利——山海鲸软件水利方案解析
  • C4.5决策树的基本建模流程