敏感词过滤工具

本文介绍了如何使用DFA算法实现敏感词检测,并展示了如何新增replace方法,以实现在文本中查找并替换敏感词。通过实例演示了如何构建敏感词DFA树(嵌套Map结构)和替换词典,以及如何在内容中进行最小匹配和最大匹配的敏感词替换。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

最近需要使用到敏感词相关业务,上网参考了DFA算法的代码实现,并在此基础上做了些调整,新增了replace相关方法,代码如下:
package sensitive;

import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.*;

/**
 * Detects and replaces sensitive words in text using a character trie (DFA).
 *
 * <p>Characters matched by {@link #IGNORE_SPECIAL_CHAR_REGEX} (punctuation and
 * whitespace) are skipped both when the dictionary is built and when content is
 * scanned, so a word is still found when such characters are interleaved with
 * its letters.
 *
 * <p>The dictionary is held as an immutable snapshot behind a single
 * {@code volatile} reference: {@link #rebuildSensitiveWords(Map)} builds a new
 * snapshot and publishes it in one write, so readers always see a trie and a
 * replacement table that belong together.
 */
public class SensitiveWordUtils {

	/** Stop at the first (shortest) dictionary word found at a position. */
	private static final int MIN_MATCH_TYPE = 0;

	/** Keep scanning and report the longest dictionary word found at a position. */
	private static final int MAX_MATCH_TYPE = -1;

	// NOTE(review): after the escaped backslash, the '[' appears to open a nested
	// character class, so '[' and ']' themselves are presumably NOT part of the
	// ignored set - confirm whether that is intended before changing the pattern.
	private static final String IGNORE_SPECIAL_CHAR_REGEX = "[`~!@#$%^&*()+=|{}':;',\\\\[\\\\].<>/?~!@#¥%……&*()——+|{}【】‘;:”“’。,、?]|\\s*";

	/** Compiled once; compiling per character was the dominant cost of a scan. */
	private static final Pattern IGNORE_SPECIAL_CHAR_PATTERN = Pattern.compile(IGNORE_SPECIAL_CHAR_REGEX);

	/** Current dictionary snapshot; starts empty so lookups before the first rebuild find nothing. */
	private static volatile Dictionary dictionary =
			new Dictionary(new TrieNode(), new HashMap<String, String>());

	/** One trie node: outgoing edges by character plus an end-of-word marker. */
	private static final class TrieNode {
		private final Map<Character, TrieNode> children = new HashMap<>();
		private boolean end;
	}

	/** Trie and replacement table built from the same input; never mutated after publication. */
	private static final class Dictionary {
		private final TrieNode root;
		private final Map<String, String> replacements;

		private Dictionary(TrieNode root, Map<String, String> replacements) {
			this.root = root;
			this.replacements = replacements;
		}
	}

	/** A dictionary hit: the normalized word and how many content characters it spans. */
	private static final class Match {
		private final String word;
		private final int length;

		private Match(String word, int length) {
			this.word = word;
			this.length = length;
		}
	}

	/**
	 * Replaces the whole dictionary.
	 *
	 * <p>The map is copied, so later changes to it have no effect until this
	 * method is called again. Keys are normalized by dropping ignored characters;
	 * {@code null} keys and keys that normalize to the empty string are skipped.
	 *
	 * @param sensitiveWordMap sensitive word to its replacement text
	 * @throws NullPointerException if {@code sensitiveWordMap} is {@code null}
	 */
	public static synchronized void rebuildSensitiveWords(Map<String, String> sensitiveWordMap) {
		Objects.requireNonNull(sensitiveWordMap, "sensitiveWordMap");
		TrieNode root = new TrieNode();
		Map<String, String> replacements = new HashMap<>();
		for (Map.Entry<String, String> entry : sensitiveWordMap.entrySet()) {
			String normalized = normalize(entry.getKey());
			if (normalized.isEmpty()) {
				continue;
			}
			insert(root, normalized);
			// Keyed by the normalized word because that is what a scan reconstructs.
			replacements.put(normalized, entry.getValue());
		}
		dictionary = new Dictionary(root, replacements);
	}

	/** Adds an already-normalized, non-empty word to the trie. */
	private static void insert(TrieNode root, String word) {
		TrieNode node = root;
		for (int i = 0; i < word.length(); i++) {
			char key = word.charAt(i);
			TrieNode next = node.children.get(key);
			if (next == null) {
				next = new TrieNode();
				node.children.put(key, next);
			}
			node = next;
		}
		node.end = true;
	}

	/** Returns {@code word} without ignored characters; {@code null} becomes the empty string. */
	private static String normalize(String word) {
		if (word == null) {
			return "";
		}
		StringBuilder kept = new StringBuilder(word.length());
		for (int i = 0; i < word.length(); i++) {
			char c = word.charAt(i);
			if (!isIgnore(c)) {
				kept.append(c);
			}
		}
		return kept.toString();
	}

	/**
	 * Finds sensitive words using shortest match.
	 *
	 * @param content text to scan; {@code null} yields an empty set
	 * @return the matched spans exactly as they appear in {@code content}
	 */
	public static Set<String> getSensitiveWordMinMatch(String content) {
		return getSensitiveWord(content, MIN_MATCH_TYPE);
	}

	/**
	 * Finds sensitive words using longest match.
	 *
	 * @param content text to scan; {@code null} yields an empty set
	 * @return the matched spans exactly as they appear in {@code content}
	 */
	public static Set<String> getSensitiveWordMaxMatch(String content) {
		return getSensitiveWord(content, MAX_MATCH_TYPE);
	}

	/** Collects every non-overlapping match, scanning left to right. */
	private static Set<String> getSensitiveWord(String content, int matchType) {
		Set<String> found = new HashSet<>();
		if (content == null) {
			return found;
		}
		TrieNode root = dictionary.root;
		int index = 0;
		while (index < content.length()) {
			Match match = findMatch(root, content, index, matchType);
			if (match == null) {
				index++;
			} else {
				found.add(content.substring(index, index + match.length));
				index += match.length;
			}
		}
		return found;
	}

	/**
	 * Tries to match a dictionary word starting exactly at {@code beginIndex}.
	 *
	 * <p>The span must start on a non-ignored character and ends on the last
	 * character of the matched word, so punctuation around a word is never
	 * absorbed into the match; only ignored characters between its letters are.
	 *
	 * @return the match, or {@code null} when no word starts at {@code beginIndex}
	 */
	private static Match findMatch(TrieNode root, String content, int beginIndex, int matchType) {
		if (isIgnore(content.charAt(beginIndex))) {
			return null;
		}
		StringBuilder word = new StringBuilder();
		TrieNode node = root;
		String matchedWord = null;
		int matchedLength = 0;
		for (int i = beginIndex; i < content.length(); i++) {
			char key = content.charAt(i);
			if (isIgnore(key)) {
				continue;
			}
			node = node.children.get(key);
			if (node == null) {
				break;
			}
			word.append(key);
			if (node.end) {
				// Record the span at the last end-of-word node only; characters
				// consumed afterwards on a path that never completes must not count.
				matchedWord = word.toString();
				matchedLength = i - beginIndex + 1;
				if (matchType == MIN_MATCH_TYPE) {
					break;
				}
			}
		}
		return matchedWord == null ? null : new Match(matchedWord, matchedLength);
	}

	/** Tells whether the character is skipped during dictionary building and scanning. */
	private static boolean isIgnore(char specificChar) {
		return IGNORE_SPECIAL_CHAR_PATTERN.matcher(String.valueOf(specificChar)).matches();
	}

	/**
	 * Replaces sensitive words using shortest match.
	 *
	 * @param content text to filter; {@code null} is returned unchanged
	 * @return {@code content} with each matched span replaced by its configured text
	 */
	public static String replaceSensitiveWordMinMatch(String content) {
		return replaceSensitiveWord(content, MIN_MATCH_TYPE);
	}

	/**
	 * Replaces sensitive words using longest match.
	 *
	 * @param content text to filter; {@code null} is returned unchanged
	 * @return {@code content} with each matched span replaced by its configured text
	 */
	public static String replaceSensitiveWordMaxMatch(String content) {
		return replaceSensitiveWord(content, MAX_MATCH_TYPE);
	}

	/** Rewrites {@code content}, substituting each match with its replacement. */
	private static String replaceSensitiveWord(String content, int matchType) {
		if (content == null) {
			return null;
		}
		// Read the snapshot once so trie and replacements come from the same rebuild.
		Dictionary current = dictionary;
		StringBuilder out = new StringBuilder(content.length());
		int index = 0;
		while (index < content.length()) {
			Match match = findMatch(current.root, content, index, matchType);
			if (match == null) {
				out.append(content.charAt(index));
				index++;
				continue;
			}
			String replacement = current.replacements.get(match.word);
			if (replacement == null) {
				// A word configured with a null replacement is left as written
				// instead of emitting the literal text "null".
				out.append(content, index, index + match.length);
			} else {
				out.append(replacement);
			}
			index += match.length;
		}
		return out.toString();
	}

	/** Small demonstration: builds a dictionary, then prints matches and filtered text. */
	public static void main(String[] args) {
		Map<String, String> sensitiveWordMap = new ConcurrentHashMap<String, String>();
		sensitiveWordMap.put("西巴", "$$");
		rebuildSensitiveWords(sensitiveWordMap);
		Set<String> resultSet = getSensitiveWordMinMatch("小*,*西&&&巴花美元买豆腐");
		for (String str : resultSet) {
			System.out.println(str);
		}
		System.out.println(replaceSensitiveWordMinMatch("小*,*西&&&巴花美元买豆腐"));
		System.out.println("===============");
		sensitiveWordMap.put("美元", "######");
		rebuildSensitiveWords(sensitiveWordMap);
		resultSet = getSensitiveWordMinMatch("小*,*西&&&巴花美元买豆腐");
		for (String str : resultSet) {
			System.out.println(str);
		}
		System.out.println(replaceSensitiveWordMinMatch("小*,*西&&&巴花美元买豆腐"));
	}
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值