当前位置: 首页 > news >正文

Apache Tika 实现各种文档内容解析

Apache Tika 实现各种文档内容解析

1、依赖

<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build for the Tika demo: Spring Boot web application plus Apache Tika parsers. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>cn.js</groupId>
    <artifactId>TikaResouce</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.7.0</version>
    </parent>

    <dependencyManagement>
        <dependencies>
            <!-- Imported BOM: the Tika dependencies below are declared without a version. -->
            <dependency>
                <groupId>org.apache.tika</groupId>
                <artifactId>tika-bom</artifactId>
                <version>2.8.0</version>
                <type>pom</type>
                <scope>import</scope>
            </dependency>
        </dependencies>
    </dependencyManagement>

    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>commons-fileupload</groupId>
            <artifactId>commons-fileupload</artifactId>
            <version>1.4</version>
        </dependency>
        <dependency>
            <groupId>org.apache.tika</groupId>
            <artifactId>tika-core</artifactId>
        </dependency>
        <dependency>
            <groupId>org.apache.tika</groupId>
            <artifactId>tika-parsers-standard-package</artifactId>
        </dependency>
    </dependencies>
</project>

2、配置文件

在类路径下（例如 src/main/resources 目录）新建一个 tika-config.xml 文件，配置类会通过 classpath:tika-config.xml 加载它：
<?xml version="1.0" encoding="UTF-8"?>
<!-- Tika configuration: declares three encoding detectors, each with its own markLimit parameter. -->
<properties>
    <encodingDetectors>
        <encodingDetector class="org.apache.tika.parser.html.HtmlEncodingDetector">
            <params>
                <param name="markLimit" type="int">64000</param>
            </params>
        </encodingDetector>
        <encodingDetector class="org.apache.tika.parser.txt.UniversalEncodingDetector">
            <params>
                <param name="markLimit" type="int">64001</param>
            </params>
        </encodingDetector>
        <encodingDetector class="org.apache.tika.parser.txt.Icu4jEncodingDetector">
            <params>
                <param name="markLimit" type="int">64002</param>
            </params>
        </encodingDetector>
    </encodingDetectors>
</properties>

3、配置类

package cn.js.config;import java.io.IOException;
import java.io.InputStream;
import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.core.io.Resource;
import org.springframework.core.io.ResourceLoader;
import org.xml.sax.SAXException;

/**
 * Tika configuration class: exposes a {@link Tika} facade as a Spring bean, built from the
 * {@code tika-config.xml} file found on the classpath.
 */
@Configuration
public class MyTikaConfig {

    @Autowired
    private ResourceLoader resourceLoader;

    /**
     * Builds the {@link Tika} bean from {@code classpath:tika-config.xml}.
     *
     * <p>The detector comes from the loaded {@link TikaConfig}, and the parser is an
     * {@link AutoDetectParser} created from that same configuration.
     *
     * @return a Tika facade wired with the configured detector and auto-detect parser
     * @throws TikaException declared by the {@link TikaConfig} constructor
     * @throws IOException   if the resource stream cannot be opened, read or closed
     * @throws SAXException  declared by the {@link TikaConfig} constructor
     */
    @Bean
    public Tika tika() throws TikaException, IOException, SAXException {
        Resource resource = resourceLoader.getResource("classpath:tika-config.xml");

        // The stream is only needed while the configuration is being read; close it right
        // after instead of leaving it open for the lifetime of the application.
        final TikaConfig config;
        try (InputStream inputStream = resource.getInputStream()) {
            config = new TikaConfig(inputStream);
        }

        Detector detector = config.getDetector();
        Parser autoDetectParser = new AutoDetectParser(config);
        return new Tika(detector, autoDetectParser);
    }
}

4、controller

package cn.js.controller;import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.springframework.http.HttpRequest;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;import javax.annotation.Resource;
import java.io.IOException;
import java.io.InputStream;

/**
 * REST controller mapped under {@code /tika} that feeds uploaded files to the injected
 * {@link Tika} instance.
 */
@RestController
@RequestMapping("/tika")
public class TikaController {

    @Resource
    private Tika tika;

    /**
     * Handles {@code POST /tika/pdf}: reads the multipart part named {@code file}, asks Tika for
     * its text content and prints that text to standard output. Nothing is returned to the caller.
     *
     * @param file the uploaded multipart file bound to the {@code file} request parameter
     * @throws IOException   if the upload's stream cannot be obtained or read
     * @throws TikaException declared by {@code Tika.parseToString}
     */
    @PostMapping("/pdf")
    public void TikaDemon(@RequestParam("file") MultipartFile file) throws IOException, TikaException {
        // NOTE(review): the stream is not closed here; Tika's parseToString is documented to
        // close the stream it is given — confirm against the Tika version in use.
        final InputStream uploadStream = file.getInputStream();
        System.out.println(tika.parseToString(uploadStream));
    }
}
http://www.lryc.cn/news/396257.html

相关文章:

  • Vue3 监听属性
  • Transformer模型论文解读、源码分析和项目实践
  • 前端部署自动上传资源文件到cdn/oss 解决路由和访问慢的问题
  • Diffusion 公式推导 2
  • layui-表单(输入框)
  • 中职网络安全B模块渗透测试server2380
  • 微信小程序毕业设计-教育培训系统项目开发实战(附源码+论文)
  • 【面试题】正向代理和反向代理的区别?
  • Python面试宝典第8题:二叉树遍历
  • FastReport 指定sql 和修改 数据库连接地址的 工具类 :FastReportHelper
  • C++11中重要的新特性 Part one
  • VB 关键字
  • Linux——多线程(四)
  • InetAddress.getLocalHost().getHostAddress()阻塞导致整个微服务崩溃
  • 在 Qt6 中,QList 和 QVector 统一 成qlist了吗?
  • 第三期书生大模型实战营 第1关 Linux 基础知识
  • 架构设计(1)分布式架构
  • 机器学习笔记:初始化0的问题
  • JavaWeb—js(3)
  • PLSQL Day4
  • git合并报错:git -c core.quotepath=false -c log.showSignature=false merge r
  • 云原生存储:使用MinIO与Spring整合
  • 等保测评新趋势:应对数字化转型中的安全挑战
  • 使用esptool工具备份ESP32的固件(从芯片中备份下来固件)
  • JS进阶-解析赋值
  • Java虚拟机面试题汇总
  • C++休眠的方法
  • 选择排序(C语言版)
  • 基于CentOS Stream 9平台搭建FRP内网穿透
  • Redis管理禁用命令