// 最近需要用到敏感词相关业务,参考了网上的 DFA 代码实现,并在此基础上做了一些调整,新增了 replace 相关方法,代码如下:
package sensitive;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.*;
/**
 * Sensitive-word detector and replacer backed by a character trie (the "DFA" approach).
 *
 * <p>The dictionary is supplied through {@link #rebuildSensitiveWords(Map)} as a map from
 * sensitive word to its replacement text. Characters matched by
 * {@link #IGNORE_SPECIAL_CHAR_REGEX} are skipped both when words are loaded and when text is
 * scanned, so {@code "a*b"} in the text matches the dictionary word {@code "ab"}.
 *
 * <p>Matching rules:
 * <ul>
 *   <li>Ignored characters that precede or sit inside a matched word are part of the reported
 *       match (and are removed by replacement); ignored characters after the end of a word are
 *       not.</li>
 *   <li>"Min match" stops at the first complete word from a start position, "max match" uses
 *       the longest complete word from that position.</li>
 * </ul>
 *
 * <p>Thread-safety: every rebuild creates a brand-new trie and publishes it with a single
 * volatile write; the published trie is never modified afterwards, so readers always see either
 * the previous dictionary or the new one, never a half-built tree.
 */
public class SensitiveWordUtils {

    /** Stop at the first (shortest) complete word found from a start position. */
    private static final int MIN_MATCH_TYPE = 0;

    /** Keep scanning and use the longest complete word found from a start position. */
    private static final int MAX_MATCH_TYPE = -1;

    /*
     * Characters that are skipped while building the trie and while scanning text.
     * NOTE(review): as written, the doubled escapes before '[' and ']' appear to make the class
     * match a literal backslash rather than the bracket characters themselves - confirm the
     * intent before changing this literal.
     */
    private final static String IGNORE_SPECIAL_CHAR_REGEX = "[`~!@#$%^&*()+=|{}':;',\\\\[\\\\].<>/?~!@#¥%……&*()——+|{}【】‘;:”“’。,、?]|\\s*";

    /** Compiled once; the original recompiled the expression for every single character. */
    private static final Pattern IGNORE_SPECIAL_CHAR_PATTERN = Pattern.compile(IGNORE_SPECIAL_CHAR_REGEX);

    /** Currently published trie. Starts empty so queries before the first rebuild find nothing. */
    private static volatile TrieNode root = new TrieNode();

    /** One trie state: outgoing transitions plus, for terminal states, the replacement text. */
    private static final class TrieNode {
        private final Map<Character, TrieNode> children = new HashMap<Character, TrieNode>();
        private boolean end;
        private String replacement;
    }

    /** Result of a successful scan: how many characters were consumed and what replaces them. */
    private static final class Match {
        private final int length;
        private final String replacement;

        private Match(int length, String replacement) {
            this.length = length;
            this.replacement = replacement;
        }
    }

    /**
     * Replaces the whole dictionary.
     *
     * <p>The replacement texts are copied into the trie, so changing the passed map afterwards
     * has no effect until this method is called again.
     *
     * @param sensitiveWordMap sensitive word to replacement text; {@code null} or empty clears
     *                         the dictionary. {@code null} keys and words consisting only of
     *                         ignored characters are skipped. A {@code null} replacement removes
     *                         the matched text.
     */
    public static synchronized void rebuildSensitiveWords(Map<String, String> sensitiveWordMap) {
        TrieNode freshRoot = new TrieNode();
        if (sensitiveWordMap != null) {
            for (Map.Entry<String, String> entry : sensitiveWordMap.entrySet()) {
                addWord(freshRoot, entry.getKey(), entry.getValue());
            }
        }
        // Publish only after the trie is complete so readers never observe a partial tree.
        root = freshRoot;
    }

    /** Inserts one word (with ignored characters stripped) into the trie under {@code trieRoot}. */
    private static void addWord(TrieNode trieRoot, String word, String replacement) {
        if (word == null) {
            return;
        }
        TrieNode node = trieRoot;
        for (int i = 0; i < word.length(); i++) {
            char key = word.charAt(i);
            if (isIgnore(key)) {
                continue;
            }
            TrieNode next = node.children.get(key);
            if (next == null) {
                next = new TrieNode();
                node.children.put(key, next);
            }
            node = next;
        }
        // Mark the terminal state after the loop: marking it only when the last index is reached
        // misses words whose final character is an ignored one. Never mark the root itself.
        if (node != trieRoot) {
            node.end = true;
            node.replacement = replacement;
        }
    }

    /**
     * Finds sensitive words using the shortest match at every position.
     *
     * @param content text to scan; {@code null} yields an empty set
     * @return the matched substrings of {@code content} (including skipped characters in them)
     */
    public static Set<String> getSensitiveWordMinMatch(String content) {
        return getSensitiveWord(content, MIN_MATCH_TYPE);
    }

    /**
     * Finds sensitive words using the longest match at every position.
     *
     * @param content text to scan; {@code null} yields an empty set
     * @return the matched substrings of {@code content} (including skipped characters in them)
     */
    public static Set<String> getSensitiveWordMaxMatch(String content) {
        return getSensitiveWord(content, MAX_MATCH_TYPE);
    }

    /** Collects every matched substring of {@code content}, resuming after each match. */
    private static Set<String> getSensitiveWord(String content, int matchType) {
        Set<String> found = new HashSet<>();
        if (content == null) {
            return found;
        }
        TrieNode snapshot = root; // one consistent dictionary for the whole scan
        for (int i = 0; i < content.length(); i++) {
            Match match = findMatch(snapshot, content, i, matchType);
            if (match != null) {
                found.add(content.substring(i, i + match.length));
                i += match.length - 1; // the loop increment moves past the match
            }
        }
        return found;
    }

    /**
     * Tries to match a dictionary word starting at {@code beginIndex}.
     *
     * @return the match, or {@code null} when no complete word starts at that position
     */
    private static Match findMatch(TrieNode trieRoot, String content, int beginIndex, int matchType) {
        TrieNode node = trieRoot;
        Match best = null;
        for (int i = beginIndex; i < content.length(); i++) {
            char key = content.charAt(i);
            if (isIgnore(key)) {
                // Skipped characters only count if a later character completes a word.
                continue;
            }
            node = node.children.get(key);
            if (node == null) {
                break;
            }
            if (node.end) {
                // Remember the length at the last complete word; characters walked after it
                // that do not lead to a longer word must not be part of the match.
                best = new Match(i - beginIndex + 1, node.replacement);
                if (matchType == MIN_MATCH_TYPE) {
                    break;
                }
            }
        }
        return best;
    }

    /** Returns whether {@code specificChar} is skipped during trie building and scanning. */
    private static boolean isIgnore(char specificChar) {
        return IGNORE_SPECIAL_CHAR_PATTERN.matcher(String.valueOf(specificChar)).matches();
    }

    /**
     * Replaces sensitive words using the shortest match at every position.
     *
     * @param content text to filter; {@code null} is returned unchanged
     * @return {@code content} with every match substituted by its configured replacement
     */
    public static String replaceSensitiveWordMinMatch(String content) {
        return replaceSensitiveWord(content, MIN_MATCH_TYPE);
    }

    /**
     * Replaces sensitive words using the longest match at every position.
     *
     * @param content text to filter; {@code null} is returned unchanged
     * @return {@code content} with every match substituted by its configured replacement
     */
    public static String replaceSensitiveWordMaxMatch(String content) {
        return replaceSensitiveWord(content, MAX_MATCH_TYPE);
    }

    /** Copies {@code content}, substituting each match with the replacement of the matched word. */
    private static String replaceSensitiveWord(String content, int matchType) {
        if (content == null) {
            return null;
        }
        TrieNode snapshot = root; // one consistent dictionary for the whole scan
        StringBuilder out = new StringBuilder(content.length());
        for (int i = 0; i < content.length(); i++) {
            Match match = findMatch(snapshot, content, i, matchType);
            if (match == null) {
                out.append(content.charAt(i));
                continue;
            }
            if (match.replacement != null) {
                out.append(match.replacement);
            }
            i += match.length - 1; // the loop increment moves past the match
        }
        return out.toString();
    }

    /** Small demo: filters a sample sentence, extends the dictionary, and filters again. */
    public static void main(String[] args) {
        Map<String, String> sensitiveWordMap = new ConcurrentHashMap<String, String>();
        sensitiveWordMap.put("西巴", "$$");
        rebuildSensitiveWords(sensitiveWordMap);
        Set<String> resultSet = getSensitiveWordMinMatch("小*,*西&&&巴花美元买豆腐");
        for (String str : resultSet) {
            System.out.println(str);
        }
        System.out.println(replaceSensitiveWordMinMatch("小*,*西&&&巴花美元买豆腐"));
        System.out.println("===============");
        sensitiveWordMap.put("美元", "######");
        rebuildSensitiveWords(sensitiveWordMap);
        resultSet = getSensitiveWordMinMatch("小*,*西&&&巴花美元买豆腐");
        for (String str : resultSet) {
            System.out.println(str);
        }
        System.out.println(replaceSensitiveWordMinMatch("小*,*西&&&巴花美元买豆腐"));
    }
}