当前位置: 首页 > news >正文

高性能AC算法多关键词匹配文本功能Java实现

直接上测试结果:

1000000数据集。
1000000关键词(匹配词)

装载消耗时间:20869 毫秒
匹配消耗时间:6599 毫秒

代码和测试案例:

package com.baian.tggroupmessagematchkeyword.ac;import lombok.Data;import java.util.*;/*** @program: tg-parent* @description: ac* @author: <发哥讲Java-694204477@qq.com>* @create: 2023-09-19 17:20**/
@Data
public class AhoCorasick {private TrieNode root;public AhoCorasick() {root = new TrieNode();}public void addKeyword(String keyword) {TrieNode current = root;for (char ch : keyword.toCharArray()) {current = current.getChildren().computeIfAbsent(ch, c -> new TrieNode());}current.setEndOfWord(true);current.addKeyword(keyword);}public void buildFailureLinks() {Queue<TrieNode> queue = new LinkedList<>();root.setFailure(null);queue.offer(root);while (!queue.isEmpty()) {TrieNode current = queue.poll();for (TrieNode child : current.getChildren().values()) {TrieNode failure = current.getFailure();while (failure != null && !failure.getChildren().containsKey(child.getKey())) {failure = failure.getFailure();}if (failure == null) {child.setFailure(root);} else {child.setFailure(failure.getChildren().get(child.getKey()));child.addAllKeywords(child.getFailure().getKeywords());}queue.offer(child);}}}public List<String> searchKeywords(String text) {List<String> result = new ArrayList<>();TrieNode current = root;for (int i = 0; i < text.length(); i++) {char ch = text.charAt(i);while (current != null && !current.getChildren().containsKey(ch)) {current = current.getFailure();}if (current == null) {current = root;} else {current = current.getChildren().get(ch);if (current.isEndOfWord()) {result.addAll(current.getKeywords());}TrieNode failure = current.getFailure();while (failure != null) {if (failure.isEndOfWord()) {result.addAll(failure.getKeywords());}failure = failure.getFailure();}}}return result;}public static class TrieNode {private char key;private boolean endOfWord;private TrieNode failure;private Map<Character, TrieNode> children;private List<String> keywords;public TrieNode() {children = new HashMap<>();keywords = new ArrayList<>();}public char getKey() {return key;}public void setKey(char key) {this.key = key;}public boolean isEndOfWord() {return endOfWord;}public void setEndOfWord(boolean endOfWord) {this.endOfWord = endOfWord;}public TrieNode getFailure() {return failure;}public void setFailure(TrieNode failure) {this.failure = failure;}public Map<Character, TrieNode> getChildren() {return children;}public List<String> getKeywords() {return keywords;}public void addKeyword(String keyword) {keywords.add(keyword);}public void addAllKeywords(List<String> keywords) {this.keywords.addAll(keywords);}}
}

main:

package test;import com.baian.tggroupmessagematchkeyword.ac.AhoCorasick;import java.util.ArrayList;
import java.util.List;
import java.util.UUID;/*** @program: tg-parent* @description: 多样本数据集 测试。* @author: <发哥讲Java-694204477@qq.com>* @create: 2023-09-19 14:11**/
public class TestMain001 {public static void main(String[] args) {long start0 = System.currentTimeMillis();List<String> datas = new ArrayList<>(1000000);for (int i = 0; i < 1000000; i++) {datas.add(UUID.randomUUID().toString() + UUID.randomUUID().toString());}AhoCorasick ahoCorasick2 = new AhoCorasick();for (int i = 0; i < 1000000; i++) {ahoCorasick2.addKeyword(UUID.randomUUID().toString());}ahoCorasick2.addKeyword("11");ahoCorasick2.addKeyword("22");ahoCorasick2.buildFailureLinks();long end0 = System.currentTimeMillis();System.out.println("装载消耗时间:" + (end0 - start0));long start = System.currentTimeMillis();for (String message : datas) {List<String> stringList = ahoCorasick2.searchKeywords(message);if (stringList.size() > 0) {
//                System.out.println(stringList + " message:" + message + " size:" + stringList.size());}}long end = System.currentTimeMillis();System.out.println("消耗时间:" + (end - start));}
}

http://www.lryc.cn/news/172951.html

相关文章:

  • 如何在没有第三方.NET库源码的情况,调试第三库代码?
  • 仿互站资源商城平台系统源码多款应用模版
  • 华为云云耀云服务器L实例评测 | L实例性能测试实践
  • VR赋能红色教育,让爱国主义精神永放光彩
  • 计算机视觉与深度学习-卷积神经网络-卷积图像去噪边缘提取-图像去噪 [北邮鲁鹏]
  • 三行代码实现图像画质修复,图片清晰度修复,清晰度提升python
  • 企业电子招投标采购系统源码之电子招投标的组成
  • 【MySQL】 MySQL的增删改查(进阶)--贰
  • 第七章 查找
  • openfeign返回消息报错.UnknownContentTypeException
  • [Linux入门]---Linux项目自动化构建工具-make/Makefile
  • [Python进阶] 程序打包之Pyinstaller参数介绍
  • Python中如何判断列表中的元素,是否在一段文本中??
  • spark Structured报错解决
  • Matter 协议系列:发现
  • Oracle 12c Docker镜像配置SSL
  • 版本控制系统git:一文了解git,以及它在生活中的应用,网站维护git代码,图导,自动化部署代码
  • uqrcode+uni-app 微信小程序生成二维码
  • 从零开始的 MyBatis 拦截器之旅:实战经验分享
  • 网络编程day05(IO多路复用)
  • 人声分离网站,帮你快速提取视频中的人声和背景音乐
  • 计算机网络常见问题
  • 上PICO,沉浸式观看亚运直播,参与跨国界游戏竞技
  • 无重复字符的最长子串 - 力扣(LeetCode)
  • 企业行政许可的种类有哪些?
  • Flink--4、DateStream API(执行环境、源算子、基本转换算子)
  • #循循渐进学51单片机#指针基础与1602液晶的初步认识#not.11
  • Lua学习笔记:探究package
  • 【面试经典150 | 双指针】三数之和
  • 现代卷积网络实战系列3:PyTorch从零构建AlexNet训练MNIST数据集