文章目录
一、应用场景
- 1、网站内容发布
- 2、评论、留言、回复
- 3、社交媒体平台
- …
二、实战示例
三、敏感词过滤服务介绍
- 基于注解+DFA算法,实现自定义敏感词过滤,自定义替换字符
- 两种注解方式:
- 方法参数注解
- 实体类注解
- 两种用户体验模式
- 直接告诉用户存在哪些敏感词
- 直接替换敏感词为自定义字符
四、具体代码实现
1、初始化敏感词到redis
该实现类进行敏感词存入Redis操作。
package com.tb.sensitiveword.service.impl;
import cn.hutool.core.collection.CollectionUtil;
import com.tb.sensitiveword.constant.GlobalConstants;
import com.tb.sensitiveword.service.ISensitiveWordFilterService;
import com.tb.sensitiveword.util.RedisCache;
import org.springframework.stereotype.Service;
import javax.annotation.Resource;
import java.util.List;
/**
 * Sensitive-word filter service implementation: stores and reads the
 * sensitive-word list in Redis.
 *
 * @author tb
 * @since 2024-04-09
 */
@Service
public class SensitiveWordFilterServiceImpl implements ISensitiveWordFilterService {

    /** Full Redis key under which the sensitive-word list is cached. */
    private static final String CACHE_KEY = GlobalConstants.REDIS_KEY_PREFIX + GlobalConstants.REDIS_KEY;

    @Resource
    private RedisCache redisCache;

    /**
     * Load the given sensitive words into Redis, replacing any previous list.
     *
     * @param words sensitive words to cache (e.g. loaded from a database)
     * @return true when the list was stored, false when {@code words} is empty
     */
    @Override
    public boolean initSensitiveWord2Redis(List<String> words) {
        if (CollectionUtil.isEmpty(words)) {
            return false;
        }
        // Drop the stale list before writing the fresh one.
        redisCache.deleteObject(CACHE_KEY);
        redisCache.setCacheList(CACHE_KEY, words);
        return true;
    }

    /**
     * Fetch the cached sensitive-word list from Redis.
     *
     * @return the words stored by {@link #initSensitiveWord2Redis(List)}
     */
    @Override
    public List<String> sensitiveWordsFromRedis() {
        return redisCache.getCacheList(CACHE_KEY);
    }
}
2、敏感词-前缀树操作工具类(DFA算法)
该工具类实现文本数据根据DFA算法进行插入、替换、查询。
package com.tb.sensitiveword.util;
import cn.hutool.core.collection.CollectionUtil;
import com.tb.sensitiveword.constant.GlobalConstants;
import com.tb.sensitiveword.model.entity.TrieNode;
import com.tb.sensitiveword.service.ISensitiveWordFilterService;
import org.apache.commons.lang3.CharUtils;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Sensitive-word trie (prefix tree / DFA) operations: insert, replace, search.
 *
 * <p>NOTE(review): {@code rootNode} is mutated without synchronization and is
 * never pruned, so words removed from Redis stay matched until restart —
 * confirm this is acceptable for a singleton Spring bean.
 */
@Component
public class TrieOperateUtil {

    @Autowired
    private ISensitiveWordFilterService sensitiveWordFilterService;

    /** Trie root; children are added lazily by {@link #addWord(String)}. */
    private TrieNode rootNode = new TrieNode();

    /**
     * Insert one sensitive word into the trie.
     *
     * @param word the word to index; blank/null words are ignored
     *             (fix: previously a null word threw NullPointerException)
     */
    public void addWord(String word) {
        if (StringUtils.isBlank(word)) {
            return;
        }
        TrieNode tmpNode = rootNode;
        for (int i = 0; i < word.length(); i++) {
            char c = word.charAt(i);
            TrieNode node = tmpNode.getSubNode(c);
            if (null == node) {
                // Create the missing child for this character.
                node = new TrieNode();
                tmpNode.addSubNode(c, node);
            }
            // Descend one level for the next character.
            tmpNode = node;
            // Mark the terminal character of the word.
            if (i == word.length() - 1) {
                tmpNode.setKeywordEnd(true);
            }
        }
    }

    /**
     * Replace every sensitive word found in {@code text}. Symbols (neither
     * ASCII alphanumeric nor CJK) inside a candidate are skipped, so e.g.
     * "bad*word" still matches "badword".
     *
     * @param text         text to filter
     * @param afterReplace replacement string; falls back to
     *                     {@link GlobalConstants#REPLACEMENT} when empty
     * @return the filtered text; blank input is returned unchanged
     *         (fix: previously returned null, which callers concatenated
     *         into the literal string "null")
     */
    public String replace(String text, String afterReplace) {
        if (StringUtils.isBlank(text)) {
            return text;
        }
        StringBuilder result = new StringBuilder();
        TrieNode tmpNode = rootNode;
        // begin = start of the current candidate, pos = current scan position
        int begin = 0, pos = 0;
        while (pos < text.length()) {
            char c = text.charAt(pos);
            if (isSymbol(c)) {
                // Outside a candidate: keep the symbol and advance begin.
                if (tmpNode == rootNode) {
                    result.append(c);
                    begin++;
                }
                // Inside a candidate: skip the symbol silently.
                pos++;
                continue;
            }
            tmpNode = tmpNode.getSubNode(c);
            if (null == tmpNode) {
                // No sensitive word starts at begin: emit one character and
                // restart the scan right after it.
                result.append(text.charAt(begin));
                pos = ++begin;
                tmpNode = rootNode;
            } else if (tmpNode.isLastCharacter()) {
                // Full sensitive word matched: emit the replacement.
                result.append(StringUtils.isEmpty(afterReplace) ? GlobalConstants.REPLACEMENT : afterReplace);
                begin = ++pos;
                tmpNode = rootNode;
            } else {
                // Partial match: keep scanning.
                pos++;
            }
        }
        // Append the unmatched tail.
        result.append(text.substring(begin));
        return result.toString();
    }

    /**
     * Whether {@code c} is treated as ignorable "symbol" noise: neither an
     * ASCII letter/digit nor a CJK character.
     *
     * @param c character to test
     * @return true when the character should be skipped during matching
     */
    private boolean isSymbol(Character c) {
        // 0x2E80~0x9FFF covers East Asian scripts.
        return !CharUtils.isAsciiAlphanumeric(c) && (c < 0x2E80 || c > 0x9FFF);
    }

    /**
     * Find all sensitive words contained in {@code text}.
     *
     * @param text text to scan
     * @return statistics map, key: word, value: occurrence count; empty for
     *         blank input (fix: previously null text threw NullPointerException)
     */
    public Map<String, Integer> find(String text) {
        Map<String, Integer> resultMap = new HashMap<>(16);
        if (StringUtils.isBlank(text)) {
            return resultMap;
        }
        TrieNode tmpNode = rootNode;
        StringBuilder word = new StringBuilder();
        // begin = start of the current candidate, pos = current scan position
        int begin = 0, pos = 0;
        while (pos < text.length()) {
            char c = text.charAt(pos);
            // Fix: skip symbols exactly like replace() does, so "bad*word"
            // is detected as well as replaced.
            if (isSymbol(c)) {
                if (tmpNode == rootNode) {
                    begin++;
                }
                pos++;
                continue;
            }
            tmpNode = tmpNode.getSubNode(c);
            if (null == tmpNode) {
                // Restart scan one position after begin.
                pos = ++begin;
                tmpNode = rootNode;
                // Fix: discard the partially matched prefix; previously it
                // leaked into the next reported word.
                word.setLength(0);
            } else if (tmpNode.isLastCharacter()) {
                // Full match: count this word.
                String w = word.append(c).toString();
                resultMap.merge(w, 1, Integer::sum);
                begin = ++pos;
                tmpNode = rootNode;
                word.setLength(0);
            } else {
                // Partial match: accumulate and keep scanning.
                word.append(c);
                pos++;
            }
        }
        return resultMap;
    }

    /**
     * Load the sensitive-word list from Redis and insert every word into the trie.
     *
     * @return the words fetched from Redis (may be null or empty)
     */
    public List<String> sensitiveWordsFromRedisAndSet() {
        List<String> words = sensitiveWordFilterService.sensitiveWordsFromRedis();
        if (CollectionUtil.isNotEmpty(words)) {
            for (String word : words) {
                addWord(word);
            }
        }
        return words;
    }
}
3、敏感词过滤自定义注解
该注解为是否开启敏感词过滤的注解,需要和下面FilterSensitiveWords 注解结合使用。
package com.tb.sensitiveword.annotation;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import java.lang.annotation.Target;
/**
 * Marks a method as subject to sensitive-word validation. Only takes effect
 * when {@link #isValid()} is true; used together with the parameter/field
 * level FilterSensitiveWords annotation.
 */
@Retention(RetentionPolicy.RUNTIME)
@Target({ElementType.METHOD})
public @interface ValidSensitiveWords {

    /**
     * Whether sensitive-word checking is enabled for the annotated method.
     *
     * @return the validation flag; defaults to false (no checking)
     */
    boolean isValid() default false;
}
4、敏感词过滤自定义参数(字段)注解
该注解为参数(字段)注解,需要ValidSensitiveWords.isValid为true时才生效。
package com.tb.sensitiveword.annotation;
import com.tb.sensitiveword.constant.GlobalConstants;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import java.lang.annotation.Target;
/**
 * Parameter/field-level annotation selecting String values for sensitive-word
 * filtering. Only honored when the enclosing method is annotated with
 * ValidSensitiveWords(isValid = true).
 */
@Retention(RetentionPolicy.RUNTIME)
@Target({ElementType.PARAMETER, ElementType.FIELD})
public @interface FilterSensitiveWords {

    /**
     * Replacement text for matched sensitive words.
     *
     * @return the replacement string; defaults to {@code GlobalConstants.REPLACEMENT}
     */
    String replacement() default GlobalConstants.REPLACEMENT;

    /**
     * Whether matches are replaced (true) or the request is rejected (false).
     *
     * @return the replace flag; defaults to {@code GlobalConstants.IS_REPLACE}
     */
    boolean isReplace() default GlobalConstants.IS_REPLACE;
}
5、AOP切面
该切面为敏感词过滤的具体实现。
package com.tb.sensitiveword.aop;
import cn.hutool.core.collection.CollectionUtil;
import com.alibaba.fastjson2.JSONObject;
import com.tb.sensitiveword.annotation.FilterSensitiveWords;
import com.tb.sensitiveword.annotation.ValidSensitiveWords;
import com.tb.sensitiveword.util.TrieOperateUtil;
import org.aspectj.lang.ProceedingJoinPoint;
import org.aspectj.lang.annotation.Around;
import org.aspectj.lang.annotation.Aspect;
import org.aspectj.lang.annotation.Pointcut;
import org.aspectj.lang.reflect.MethodSignature;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;
import java.lang.reflect.Field;
import java.lang.reflect.Parameter;
import java.util.*;
import java.util.stream.Collectors;
/**
 * Sensitive-word filtering AOP aspect: validates or rewrites the String
 * arguments (and annotated String fields of custom-type arguments) of any
 * method annotated with {@code @ValidSensitiveWords}.
 *
 * @author tb
 * @since 2024-04-22
 */
@Aspect
@Component
public class SensitiveWordsAspect {

    private static final Logger logger = LoggerFactory.getLogger(SensitiveWordsAspect.class);

    @Autowired
    private TrieOperateUtil operateUtil;

    /** Matches every method carrying the @ValidSensitiveWords annotation. */
    @Pointcut("@annotation(com.tb.sensitiveword.annotation.ValidSensitiveWords)")
    public void pointcut() {
    }

    /**
     * Around advice: validates or replaces sensitive words in the arguments
     * of the intercepted method, then proceeds with the (possibly rewritten)
     * arguments.
     *
     * @param joinPoint the intercepted invocation
     * @return the target method's return value
     * @throws Throwable RuntimeException when sensitive words are found in
     *         validate-only mode, or whatever the target method throws
     */
    @Around("SensitiveWordsAspect.pointcut()")
    public Object filterSensitiveWords(ProceedingJoinPoint joinPoint) throws Throwable {
        Object[] args = joinPoint.getArgs();
        if (joinPoint.getSignature() instanceof MethodSignature) {
            MethodSignature methodSignature = (MethodSignature) joinPoint.getSignature();
            ValidSensitiveWords anno = methodSignature.getMethod().getAnnotation(ValidSensitiveWords.class);
            if (!anno.isValid()) {
                // Checking disabled on this method: proceed untouched.
                return joinPoint.proceed();
            }
            Parameter[] parameters = methodSignature.getMethod().getParameters();
            for (int i = 0; i < parameters.length; i++) {
                // Validate/replace this argument (and its annotated fields).
                JSONObject result = fieldSensitiveWordFilter(args, i, parameters[i]);
                if (result.containsKey("isExist") && result.getBoolean("isExist")) {
                    String words = result.getString("words");
                    throw new RuntimeException("存在敏感内容【" + words + "】,请重新输入!");
                }
            }
        }
        return joinPoint.proceed(args);
    }

    /**
     * Filter one method argument: a String argument annotated with
     * {@code @FilterSensitiveWords}, or the annotated String fields of a
     * custom-type argument.
     *
     * @param args      the invocation arguments (rewritten in place when replacing)
     * @param i         index of the argument being processed
     * @param parameter the reflective parameter descriptor
     * @return JSON carrying "isExist" (true when sensitive words were found in
     *         validate-only mode) and "words" (comma-joined distinct words)
     * @throws IllegalAccessException if a field cannot be read/written reflectively
     */
    private JSONObject fieldSensitiveWordFilter(Object[] args, int i, Parameter parameter) throws IllegalAccessException {
        JSONObject jsonObj = new JSONObject();
        Set<String> set = new HashSet<>();
        Class<?> type = parameter.getType();
        if (type == String.class) {
            FilterSensitiveWords filterSensitiveWords = parameter.getAnnotation(FilterSensitiveWords.class);
            // Fix: skip null arguments — String.valueOf(null) used to coerce
            // them to the literal string "null".
            if (filterSensitiveWords != null && args[i] != null) {
                String text = (String) args[i];
                if (filterSensitiveWords.isReplace()) {
                    // Rewrite the argument with the filtered value.
                    args[i] = replaceWord(filterSensitiveWords, text);
                } else {
                    collectWords(text, jsonObj, set);
                }
            }
        }
        // A null classLoader means a JDK/bootstrap type, not a user-defined class.
        // Fix: also guard against a null argument before reflecting its fields.
        if (type.getClassLoader() != null && args[i] != null) {
            Object obj = args[i];
            for (Field declaredField : type.getDeclaredFields()) {
                FilterSensitiveWords filterSensitiveWords = declaredField.getAnnotation(FilterSensitiveWords.class);
                // Only annotated String fields are filtered.
                if (filterSensitiveWords == null || declaredField.getType() != String.class) {
                    continue;
                }
                // Bypass access checks for private fields.
                declaredField.setAccessible(true);
                Object rawValue = declaredField.get(obj);
                // Fix: leave null fields untouched instead of coercing to "null".
                if (rawValue == null) {
                    continue;
                }
                String fieldValue = (String) rawValue;
                if (filterSensitiveWords.isReplace()) {
                    // Write the filtered value back into the entity.
                    declaredField.set(obj, replaceWord(filterSensitiveWords, fieldValue));
                } else {
                    collectWords(fieldValue, jsonObj, set);
                }
            }
        }
        jsonObj.put("words", String.join(",", set));
        return jsonObj;
    }

    /**
     * Run a sensitive-word search on {@code text} and merge any hits into the
     * result object and the distinct-word set.
     *
     * @param text    value to scan
     * @param jsonObj accumulator for the "isExist" flag
     * @param set     accumulator for the distinct matched words
     */
    private void collectWords(String text, JSONObject jsonObj, Set<String> set) {
        JSONObject result = findWord(text);
        if (result.getBoolean("isExist")) {
            jsonObj.put("isExist", Boolean.TRUE);
            set.addAll(result.getJSONObject("wordsMap").keySet());
        }
    }

    /**
     * Replace sensitive words in the value using the annotation's replacement.
     *
     * @param filterSensitiveWords annotation carrying the replacement string
     * @param fieldValue           original value (non-null)
     * @return the filtered value, or the original when no words are cached or
     *         the filter produced a blank result
     */
    private String replaceWord(FilterSensitiveWords filterSensitiveWords, String fieldValue) {
        // Refresh the trie from Redis before filtering.
        List<String> words = sensitiveWordsFromRedisAndSet();
        if (CollectionUtil.isEmpty(words)) {
            return fieldValue;
        }
        String replacement = StringUtils.hasText(filterSensitiveWords.replacement())
                ? filterSensitiveWords.replacement() : null;
        String filtered = operateUtil.replace(fieldValue, replacement);
        // Fix: previously a null/blank filter result was appended to a
        // StringBuffer, which could yield the literal string "null".
        return StringUtils.hasText(filtered) ? filtered : fieldValue;
    }

    /**
     * Scan a value for sensitive words.
     *
     * @param fieldValue text to scan
     * @return JSON with "isExist" (boolean) and "wordsMap" (word -> count)
     */
    private JSONObject findWord(String fieldValue) {
        JSONObject result = new JSONObject();
        boolean isExist = false;
        Map<String, Integer> wordsMap = new HashMap<>();
        // Refresh the trie from Redis before searching.
        List<String> words = sensitiveWordsFromRedisAndSet();
        if (CollectionUtil.isNotEmpty(words)) {
            wordsMap = operateUtil.find(fieldValue);
            isExist = CollectionUtil.isNotEmpty(wordsMap);
        }
        result.put("isExist", isExist);
        result.put("wordsMap", new JSONObject(wordsMap));
        return result;
    }

    /**
     * Fetch the sensitive-word list from Redis and populate the trie.
     *
     * @return the words from Redis (may be null or empty)
     */
    private List<String> sensitiveWordsFromRedisAndSet() {
        return operateUtil.sensitiveWordsFromRedisAndSet();
    }
}
6、基于注解实现敏感词过滤-两种方式
package com.tb.sensitiveword.controller;
import com.tb.sensitiveword.annotation.FilterSensitiveWords;
import com.tb.sensitiveword.annotation.ValidSensitiveWords;
import com.tb.sensitiveword.model.entity.News;
import com.tb.sensitiveword.model.entity.WordDTO;
import com.tb.sensitiveword.service.ISensitiveWordFilterService;
import com.tb.sensitiveword.util.ResponseResult;
import org.springframework.web.bind.annotation.*;
import javax.annotation.Resource;
/**
 * SensitiveWordFilterController
 * REST endpoints demonstrating the two sensitive-word filtering modes.
 *
 * @author tb
 * @version 1.0
 * @date 2024/4/25 23:59
 */
@RestController
@RequestMapping("/sensitive-word")
public class SensitiveWordFilterController {

    @Resource
    private ISensitiveWordFilterService sensitiveWordFilterService;

    /**
     * Initialize the sensitive-word list in Redis.
     *
     * @param wordDTO wrapper carrying the word list to cache
     */
    @PostMapping("/initWords")
    public void initSensitiveWords2Redis(@RequestBody WordDTO wordDTO) {
        sensitiveWordFilterService.initSensitiveWord2Redis(wordDTO.getWords());
    }

    /**
     * Mode 1: method-parameter annotation.
     *
     * <p>Save news.
     *
     * <p>ValidSensitiveWords — isValid: enable checking, default false (off).
     * <p>FilterSensitiveWords — isReplace: replace matches, default false
     * (reject instead); replacement: replacement text, default ***, customizable.
     *
     * @param content news content to be filtered
     * @return empty OK response
     */
    @PostMapping("/saveNews/{content}")
    @ValidSensitiveWords(isValid = true)
    public ResponseResult<String> saveNews(@PathVariable("content") @FilterSensitiveWords(isReplace = true, replacement = "###") String content) {
        return ResponseResult.okResult(null);
    }

    /**
     * Mode 2: entity-field annotation.
     *
     * <p>Save news.
     *
     * <p>ValidSensitiveWords — isValid: enable checking, default false (off).
     * <p>FilterSensitiveWords — isReplace: replace matches, default false
     * (reject instead); replacement: replacement text, default ***, customizable.
     *
     * @param news news entity whose annotated fields are filtered
     * @return OK response carrying the (possibly filtered) entity
     */
    @PostMapping("/saveNews")
    @ValidSensitiveWords(isValid = true)
    public ResponseResult<News> saveNews(@RequestBody News news) {
        return ResponseResult.okResult(news);
    }
}
实体类:News
package com.tb.sensitiveword.model.entity;
import com.fasterxml.jackson.annotation.JsonFormat;
import com.tb.sensitiveword.annotation.FilterSensitiveWords;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import org.springframework.format.annotation.DateTimeFormat;
import java.io.Serializable;
import java.time.LocalDateTime;
/**
 * News entity; the {@code title} and {@code content} fields are subject to
 * sensitive-word filtering via {@code @FilterSensitiveWords}.
 *
 * <p>Fix: fields made {@code private} (they were package-private); Lombok's
 * {@code @Data} already generates the public getters/setters callers use.
 *
 * @author tb
 * @version 1.0
 * @date 2024/4/26 0:09
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class News implements Serializable {

    private static final long serialVersionUID = 1L;

    // Primary key
    private String id;

    // Title — filtered for sensitive words
    @FilterSensitiveWords
    private String title;

    // Body content — filtered for sensitive words
    @FilterSensitiveWords
    private String content;

    // Author
    private String author;

    // Source
    private String source;

    // Publish time, serialized by Jackson as "yyyy-MM-dd HH:mm"
    @JsonFormat(pattern = "yyyy-MM-dd HH:mm")
    private LocalDateTime publishTime;
}
五、源码资源下载
【SpringBoot 项目实战-敏感词过滤】基于自定义注解+DFA算法,实现自定义敏感词过滤、自定义字符替换源码免费下载
参考文献
1、如何使用Java实现敏感词过滤的功能
2、DFA算法详解