高性能AC算法多关键词匹配文本功能Java实现
直接上测试结果:
1000000数据集。 1000000关键词(匹配词)
装载消耗时间:20869 毫秒
匹配消耗时间:6599 毫秒
代码和测试案例:
package com.baian.tggroupmessagematchkeyword.ac;import lombok.Data;import java.util.*;/*** @program: tg-parent* @description: ac* @author: <发哥讲Java-694204477@qq.com>* @create: 2023-09-19 17:20**/
@Data
public class AhoCorasick {private TrieNode root;public AhoCorasick() {root = new TrieNode();}public void addKeyword(String keyword) {TrieNode current = root;for (char ch : keyword.toCharArray()) {current = current.getChildren().computeIfAbsent(ch, c -> new TrieNode());}current.setEndOfWord(true);current.addKeyword(keyword);}public void buildFailureLinks() {Queue<TrieNode> queue = new LinkedList<>();root.setFailure(null);queue.offer(root);while (!queue.isEmpty()) {TrieNode current = queue.poll();for (TrieNode child : current.getChildren().values()) {TrieNode failure = current.getFailure();while (failure != null && !failure.getChildren().containsKey(child.getKey())) {failure = failure.getFailure();}if (failure == null) {child.setFailure(root);} else {child.setFailure(failure.getChildren().get(child.getKey()));child.addAllKeywords(child.getFailure().getKeywords());}queue.offer(child);}}}public List<String> searchKeywords(String text) {List<String> result = new ArrayList<>();TrieNode current = root;for (int i = 0; i < text.length(); i++) {char ch = text.charAt(i);while (current != null && !current.getChildren().containsKey(ch)) {current = current.getFailure();}if (current == null) {current = root;} else {current = current.getChildren().get(ch);if (current.isEndOfWord()) {result.addAll(current.getKeywords());}TrieNode failure = current.getFailure();while (failure != null) {if (failure.isEndOfWord()) {result.addAll(failure.getKeywords());}failure = failure.getFailure();}}}return result;}public static class TrieNode {private char key;private boolean endOfWord;private TrieNode failure;private Map<Character, TrieNode> children;private List<String> keywords;public TrieNode() {children = new HashMap<>();keywords = new ArrayList<>();}public char getKey() {return key;}public void setKey(char key) {this.key = key;}public boolean isEndOfWord() {return endOfWord;}public void setEndOfWord(boolean endOfWord) {this.endOfWord = endOfWord;}public TrieNode getFailure() {return failure;}public void setFailure(TrieNode failure) {this.failure = failure;}public Map<Character, TrieNode> getChildren() {return children;}public List<String> getKeywords() {return keywords;}public void addKeyword(String keyword) {keywords.add(keyword);}public void addAllKeywords(List<String> keywords) {this.keywords.addAll(keywords);}}
}
main:
package test;import com.baian.tggroupmessagematchkeyword.ac.AhoCorasick;import java.util.ArrayList;
import java.util.List;
import java.util.UUID;/*** @program: tg-parent* @description: 多样本数据集 测试。* @author: <发哥讲Java-694204477@qq.com>* @create: 2023-09-19 14:11**/
public class TestMain001 {public static void main(String[] args) {long start0 = System.currentTimeMillis();List<String> datas = new ArrayList<>(1000000);for (int i = 0; i < 1000000; i++) {datas.add(UUID.randomUUID().toString() + UUID.randomUUID().toString());}AhoCorasick ahoCorasick2 = new AhoCorasick();for (int i = 0; i < 1000000; i++) {ahoCorasick2.addKeyword(UUID.randomUUID().toString());}ahoCorasick2.addKeyword("11");ahoCorasick2.addKeyword("22");ahoCorasick2.buildFailureLinks();long end0 = System.currentTimeMillis();System.out.println("装载消耗时间:" + (end0 - start0));long start = System.currentTimeMillis();for (String message : datas) {List<String> stringList = ahoCorasick2.searchKeywords(message);if (stringList.size() > 0) {
// System.out.println(stringList + " message:" + message + " size:" + stringList.size());}}long end = System.currentTimeMillis();System.out.println("消耗时间:" + (end - start));}
}