问答社区项目(关键技术)发表提问时的敏感词过滤

2018-07-13

敏感词过滤

　　用户发表提问时，项目需要过滤掉恶意的 JS 注入攻击，以及涉及广告、色情、脏话等内容的敏感词汇，以保证网站的专业性，并给用户整洁、专业的体验。

　　关于敏感词的过滤：我们采用 3 个指针结合字典树的算法。首先，提取出所有的敏感词，将它们构建成一棵字典树，并用一个指针指向字典树的 root 结点。然后把待过滤的字符串当做输入，用另外两个指针（begin 和 position）控制当前遍历的位置，初始时这两个指针都指向首字符。如果当前字符不是任何敏感词的开头，就把它原样 append 到输出的 StringBuilder 中，两个指针一起往后走；如果当前字符可能是某个敏感词的开头，begin 指针保持不动，position 指针继续往后走，同时字典树上的指针也顺着匹配到的字符往子结点走。如果最终完整匹配了一个敏感词，就在输出的 StringBuilder 中用“***”替代该敏感词，然后让两个指针都从该敏感词末尾的下一个字符出发，字典树指针回到 root，继续向后寻找下一个敏感词；如果中途匹配失败，就把 begin 处的字符原样 append，并让两个指针都从 begin 的下一个字符重新开始。如此反复，直到走到字符串的末尾，过滤任务就完成了。

　　关键的filter算法如下：

/**
     * Replaces every sensitive word found in {@code text} with {@code "***"}.
     *
     * <p>Three cursors are used: {@code tempNode} walks the keyword trie,
     * {@code begin} marks where the current candidate match starts in the text,
     * and {@code position} is the character currently being examined. Symbol
     * characters (as decided by {@code isSymbol}) are skipped while a candidate
     * is in progress, so a keyword interleaved with punctuation still matches.
     *
     * @param text the text to filter; may be {@code null} or blank
     * @return the filtered text, or {@code text} itself when it is blank
     */
    public String filter(String text) {
        if (StringUtils.isBlank(text)) {
            return text;
        }

        StringBuilder result = new StringBuilder();
        String replacement = "***";
        TrieNode tempNode = rootNode;
        int begin = 0;
        int position = 0;

        // Loop on begin, not position: everything before begin has already been
        // emitted, so the work is done only once begin reaches the end.
        while (begin < text.length()) {
            if (position >= text.length()) {
                // The text ended in the middle of a candidate match. Treat it as a
                // mismatch and rescan from begin + 1; otherwise a keyword that
                // starts inside the unfinished candidate would be missed.
                result.append(text.charAt(begin));
                position = begin + 1;
                begin = position;
                tempNode = rootNode;
                continue;
            }

            char c = text.charAt(position);
            // Symbol character: skip it.
            if (isSymbol(c)) {
                if (tempNode == rootNode) {
                    // No candidate in progress, so the symbol is ordinary output.
                    result.append(c);
                    begin++;
                }
                position++;
                continue;
            }

            tempNode = tempNode.getSubNode(c);
            if (tempNode == null) {
                // No keyword starts at begin: emit that one character and restart
                // the scan right after it.
                result.append(text.charAt(begin));
                position = begin + 1;
                begin = position;
                tempNode = rootNode;
            } else if (tempNode.isKeyWordEnd()) {
                // A complete keyword spans [begin, position]: mask it.
                result.append(replacement);
                position = position + 1;
                begin = position;
                tempNode = rootNode;
            } else {
                // Still a proper prefix of some keyword: keep extending.
                position++;
            }
        }
        return result.toString();
    }

完整的算法如下：

package com.nowcoder.service;


import org.apache.commons.lang.CharUtils;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.stereotype.Service;
import sun.awt.Symbol;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;

/**
 * Masks sensitive words in user-supplied text.
 *
 * <p>The keyword list is read from the classpath resource
 * {@code SensitiveWords.txt} (one keyword per line) in
 * {@link #afterPropertiesSet()} and stored in a character trie.
 * {@link #filter(String)} then replaces every keyword occurrence with
 * {@code "***"}.
 */
@Service
public class SensitiveService implements InitializingBean {
    private static final Logger logger = LoggerFactory.getLogger(SensitiveService.class);

    /** Text substituted for every matched keyword. */
    private static final String REPLACEMENT = "***";

    /** Root of the keyword trie; it carries no character itself. */
    private final TrieNode rootNode = new TrieNode();

    /**
     * Loads the keyword list and builds the trie.
     *
     * <p>Loading is best-effort: any failure is logged and the service keeps
     * running with whatever keywords were added before the failure.
     */
    @Override
    public void afterPropertiesSet() throws Exception {
        // try-with-resources closes the stream even when readLine() fails part-way.
        try (InputStream is = Thread.currentThread().getContextClassLoader()
                .getResourceAsStream("SensitiveWords.txt")) {
            if (is == null) {
                // getResourceAsStream returns null when the resource is missing.
                logger.error("读取敏感词文件失败");
                return;
            }
            // Decode explicitly instead of relying on the platform default charset.
            // NOTE(review): assumes SensitiveWords.txt is stored as UTF-8 — confirm.
            BufferedReader bufferedReader =
                    new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
            String lineText;
            while ((lineText = bufferedReader.readLine()) != null) {
                // Each line of the file contributes one keyword to the trie.
                addWord(lineText.trim());
            }
        } catch (Exception e) {
            // Keep the cause in the log so a load failure can be diagnosed.
            logger.error("读取敏感词文件失败", e);
        }
    }

    /**
     * Inserts one keyword into the trie, ignoring symbol characters.
     *
     * @param lineTxt one line of the keyword file; a line without any
     *                non-symbol character adds nothing
     */
    private void addWord(String lineTxt) {
        TrieNode tempNode = rootNode;
        for (int i = 0; i < lineTxt.length(); i++) {
            char c = lineTxt.charAt(i);
            if (isSymbol(c)) {
                continue;
            }
            TrieNode node = tempNode.getSubNode(c);
            if (node == null) {
                node = new TrieNode();
                tempNode.addSubNode(c, node);
            }
            tempNode = node;
        }
        // Mark the end after the loop rather than at the last index, so that a
        // keyword whose final character is a symbol is still terminated.
        if (tempNode != rootNode) {
            tempNode.setkeywordEnd(true);
        }
    }

    /** One trie node: an end-of-keyword flag plus its children keyed by character. */
    private static class TrieNode {
        // Whether the path from the root to this node spells a complete keyword.
        private boolean end = false;
        // All child nodes of this node.
        private final Map<Character, TrieNode> subNodes = new HashMap<Character, TrieNode>();

        public void addSubNode(Character key, TrieNode node) {
            subNodes.put(key, node);
        }

        /** Returns the child for {@code key}, or {@code null} when there is none. */
        TrieNode getSubNode(Character key) {
            return subNodes.get(key);
        }

        boolean isKeyWordEnd() {
            return end;
        }

        void setkeywordEnd(boolean end) {
            this.end = end;
        }
    }

    /**
     * Tells whether {@code c} is a symbol, i.e. neither an ASCII letter or
     * digit nor a character in the range 0x2E80-0x9FFF (East Asian scripts).
     */
    private boolean isSymbol(char c) {
        int ic = (int) c;
        return !CharUtils.isAsciiAlphanumeric(c) && (ic < 0x2E80 || ic > 0x9FFF);
    }

    /**
     * Replaces every sensitive word found in {@code text} with {@code "***"}.
     *
     * <p>Three cursors are used: {@code tempNode} walks the keyword trie,
     * {@code begin} marks where the current candidate match starts in the text,
     * and {@code position} is the character currently being examined. Symbol
     * characters are skipped while a candidate is in progress, so a keyword
     * interleaved with punctuation still matches.
     *
     * @param text the text to filter; may be {@code null} or blank
     * @return the filtered text, or {@code text} itself when it is blank
     */
    public String filter(String text) {
        if (StringUtils.isBlank(text)) {
            return text;
        }

        StringBuilder result = new StringBuilder();
        TrieNode tempNode = rootNode;
        int begin = 0;
        int position = 0;

        // Loop on begin, not position: everything before begin has already been
        // emitted, so the work is done only once begin reaches the end.
        while (begin < text.length()) {
            if (position >= text.length()) {
                // The text ended in the middle of a candidate match. Treat it as a
                // mismatch and rescan from begin + 1; otherwise a keyword that
                // starts inside the unfinished candidate would be missed.
                result.append(text.charAt(begin));
                position = begin + 1;
                begin = position;
                tempNode = rootNode;
                continue;
            }

            char c = text.charAt(position);
            // Symbol character: skip it.
            if (isSymbol(c)) {
                if (tempNode == rootNode) {
                    // No candidate in progress, so the symbol is ordinary output.
                    result.append(c);
                    begin++;
                }
                position++;
                continue;
            }

            tempNode = tempNode.getSubNode(c);
            if (tempNode == null) {
                // No keyword starts at begin: emit that one character and restart
                // the scan right after it.
                result.append(text.charAt(begin));
                position = begin + 1;
                begin = position;
                tempNode = rootNode;
            } else if (tempNode.isKeyWordEnd()) {
                // A complete keyword spans [begin, position]: mask it.
                result.append(REPLACEMENT);
                position = position + 1;
                begin = position;
                tempNode = rootNode;
            } else {
                // Still a proper prefix of some keyword: keep extending.
                position++;
            }
        }
        return result.toString();
    }
}