C# 使用DFA实现敏感词过滤算法

DFA即确定性有限状态自动机，用于实现状态之间的自动转移。 与DFA对应的还有一个NFA非确定有限状态自动机，二者统称为有限自动状态机FSM。它们的主要区别在于从一个状态转移的时候是否能唯一确定下一个状态。NFA在转移的时候往往不是转移到某一个确定状态，而是某个状态集合，其中的任一状态都可作为下一个状态，而DFA则是确定的。

DFA的组成：

一个非空有限状态集合 Q
一个输入集合 E
状态转移函数 f
初始状态 q0 为Q的一个元素
终止状态集合 Z 为Q的子集

一个DFA可以写成M=(Q, E, f, q0, Z)

使用DFA来实现一个敏感词过滤算法

现假设有NND, CNM, MLGB三个敏感词，则：
Q = {N, NN, NND, C, CN, CNM, M, ML, MLG, MLGB}, 以所有敏感词的组成作为状态集合，状态机只需在这些状态之间转移即可
E = {B, C, D, G, L, N, M}，以所有组成敏感词的单个字符作为输入集合，状态机只需识别构成敏感词的字符。
qo = null 初始状态为空，为空的初态可以转移到任意状态
Z = {NND, CNM, MLGB} 识别到任意一个敏感词，状态转移就可以终止了。
那么f 就可以是一个读入一个字符后查询是否为Q中的状态进而转移的函数，则转移过程为
f(null, N) = N, f(N, N) = NN, f(NN, D) = NND
f(null, C) = C, f(C, N) = CN, f(CN, M) = CNM
f(null, M) = M , f(M, L) = ML, f(ML, G) = MLG, f(MLG, B) = MLGB

具体实现

构建一个WordNode类型，其中定义一个Dictionary类型的字段Directionart<int, WordNode> node字段，所有敏感词都将作为一个WordNode
将所有敏感词按照拆分首字符为key的形式存储在这个node表中，将拆分之后的部分作为一个新的WordNode，并继续拆分，直至字符长度为1。最终构建出一棵多叉树结构：

WordNode (N, C, M) ->
->WordNode (N) -> WordNode (N) -> WordNode(D)
->WordNode (C) -> WordNode (N) -> WordNode(M)
->WordNode (M) -> WordNode (L) -> WordNode(G) -> WordNode(B)

如此状态函数f只需按照读入的字符，查询所有 WordNode字典就可以进行状态转移了。

using System;
using System.Collections;
using System.Collections.Generic;
public class FilterHelper
{
    private static FilterHelper instance;
    public static FilterHelper Instance
    {
        get
        {
            if( instance == null )
            {
                instance = new FilterHelper();
            }
            return instance;
        }
    }

    private WordNode wordNodes;

    public void InitDictionary(string[] words)
    {
        if (wordNodes == null)
        {
            wordNodes = new WordNode();
            for (int i=0,Imax = words.Length;i<Imax;i++)
            {
                string word = words[i];
                string dbcWord = this.ToDBC(word);

                //转繁体字
                //wordList.Add(Microsoft.VisualBasic.Strings.StrConv(key, Microsoft.VisualBasic.VbStrConv.TraditionalChinese, 0));
                if (dbcWord.Length > 0)
                {
                    wordNodes.Add(word);
                }
            }
        }
    }

    public struct RNode
    {
        public int start;
        public int len;
        public int type;//0为敏感词替换 1为数字序列替换
    }

    public bool CheckBadWord(string sourceText, int filterNum= 0)
    {
        if (sourceText != string.Empty)
        {
            string sourceDBCText = ToDBC(sourceText);
            for (int i = 0; i < sourceDBCText.Length; i++)
            {
                int badWordLen = 0;
                if (filterNum > 0 && isNum(sourceDBCText[i]))
                {
                    badWordLen = CheckNumberSeq(sourceDBCText, i, filterNum);
                    if (badWordLen > 0)
                    {
                        return true;
                    }
                }
                //查询以该字为首字符的词组  
                badWordLen = Check(sourceDBCText, i);
                if (badWordLen > 0)
                {
                    return true;
                }
            }
            //double timeCost = (System.DateTime.Now - timeStart).TotalMilliseconds;
            //Debug.LogError("timeCost:" + timeCost + " ms");
        }
        return false;
    }


    public string FilterWithChar(string sourceText, char replaceChar, int filterNum = 0,string numReplace = null)
    {
        //var timeStart = System.DateTime.Now;
        if (sourceText != string.Empty)
        {
            string sourceDBCText = ToDBC(sourceText);

            char[] tempString = sourceText.ToCharArray();
            List<RNode> replaceList = new List<RNode>();

            for (int i = 0; i < sourceDBCText.Length; i++)
            {
                int badWordLen = 0;
                if (filterNum > 0 && isNum(sourceDBCText[i]))
                {
                    badWordLen = CheckNumberSeq(sourceDBCText, i, filterNum);
                    if (badWordLen > 0)
                    {
                        badWordLen = badWordLen + 1;
                        if (numReplace == null)
                        {
                            for (int pos = 0; pos < badWordLen; pos++)
                            {
                                tempString[pos + i] = replaceChar;
                            }
                        }
                        else
                        { 
                            replaceList.Add(new RNode { start = i, len = badWordLen ,type = 1 });
                        }
                        i = i + badWordLen - 1;
                        continue;
                    }
                }

                //查询以该字为首字符的词组  
                badWordLen = Check(sourceDBCText, i);
                if(badWordLen > 0)
                {
                    for (int pos = 0; pos < badWordLen; pos++)
                    {
                        tempString[pos + i] = replaceChar;
                    }
                    i = i + badWordLen - 1;
                }
            }
            string result;
            if(replaceList.Count > 0)
            {
                result = replaceString(tempString, replaceList, null, numReplace);
            }else
            {
                result = new string(tempString);
            }
            //double timeCost = (System.DateTime.Now - timeStart).TotalMilliseconds;
            //Debug.LogError("timeCost:" + timeCost + " ms");
            return result;
        }
        else
        {
            return string.Empty;
        }
    }

    public string FilterWithStr(string sourceText, string replaceStr,int filterNum = 0,string numReplace=null)
    {
        //var timeStart = System.DateTime.Now;
        if (sourceText != string.Empty)
        {
            string sourceDBCText = ToDBC(sourceText);
            List<RNode> replaceList = new List<RNode>();

            if(filterNum>0 && numReplace == null)
            {
                numReplace = replaceStr;
            }
            int badWordLen = 0;
            for (int i = 0; i < sourceDBCText.Length; i++)
            {
                if (filterNum>0 && isNum(sourceDBCText[i]))
                {
                    badWordLen = CheckNumberSeq(sourceDBCText, i, filterNum);
                    if (badWordLen > 0)
                    {
                        badWordLen = badWordLen + 1;
                        int start = i;
                        replaceList.Add(new RNode { start = start, len = badWordLen, type = 1 });
                        i = i + badWordLen - 1;
                        continue;
                    }
                }

                badWordLen = Check(sourceDBCText, i);
                if (badWordLen > 0)
                {
                    replaceList.Add(new RNode { start = i, len = badWordLen, type = 0 });
                    i = i + badWordLen - 1;
                }
            }
            string tempStr = replaceString(sourceText.ToCharArray(), replaceList, replaceStr, numReplace);
            //double timeCost = (System.DateTime.Now - timeStart).TotalMilliseconds;
            //Debug.LogError("timeCost:" + timeCost + " ms");
            return tempStr;
        }
        else
        {
            return string.Empty;
        }
    }

    private string replaceString(char[] charArry, List<RNode> nodes, string replaceStr, string numReplace)
    {
        if(numReplace == null)
        {
            numReplace = replaceStr;
        }
        if(replaceStr == null)
        {
            replaceStr = numReplace;
        }

        List<char> charList = new List<char>(charArry);
        int offset = 0;
        for (int i=0,Imax=nodes.Count;i<Imax;i++)
        {
            int start = nodes[i].start + offset;
            int len = nodes[i].len;
            int endIndex = start + len - 1;
            string str = nodes[i].type == 0 ? replaceStr : numReplace;

            if (str.Length < len)
            {
                charList.RemoveRange(start, len - str.Length);
            }
            for (int j = 0,Jmax = str.Length ; j < Jmax; j++)
            {
                char ch = str[j];
                int index = start + j;
                if(index <= endIndex)
                {
                    charList[index] = ch;
                }else
                {
                    charList.Insert(index, ch);
                }
            }
            offset += str.Length - len;
        }
        return new string(charList.ToArray());
    }

    private int Check(string sourceText,int cursor)
    {
        int endsor = wordNodes.checkAndGetEndIndex(sourceText, cursor, checkSpecialSym);
        int wordLength = endsor >= cursor ? endsor - cursor + 1 : 0;
        return wordLength;
    }

    private int CheckNumberSeq(string sourceText,int cursor,int filterNum)
    {
        int count = 0;
        int offset = 0;
        if (cursor + 1 >= sourceText.Length)
        {
            return 0;
        }
        //检测下位字符如果不是汉字 数字 字符 偏移量加1  
        for (int i = cursor + 1; i < sourceText.Length; i++)
        {
            //if(checkSpecialSym(sourceText[i]))
            //{
            //    offset++;
            //}
            //else 
            if (!isNum(sourceText[i]))
            {
                break;
            }
            else
            {
                count++;
            }
        }
        if(count + 1 >= filterNum)
        {
            int wordLength = count + offset;
            return wordLength;
        }else
        {
            return 0;
        }
    }

    #region helper

    private bool checkSpecialSym(char character)
    {
        return !isCHS(character) && !isNum(character) && !isAlphabet(character);
    }

    private bool isCHS(char character)
    {
        //  中文表意字符的范围 4E00-9FA5  
        int charVal = (int)character;
        return (charVal >= 0x4e00 && charVal <= 0x9fa5);
    }

    private bool isNum(char character)
    {
        int charVal = (int)character;
        return (charVal >= 48 && charVal <= 57);
    }

    private bool isAlphabet(char character)
    {
        int charVal = (int)character;
        return ((charVal >= 97 && charVal <= 122) || (charVal >= 65 && charVal <= 90));
    }


    /// <summary>
    /// 转半角小写的函数(DBC case)
    /// </summary>
    /// <param name="input">任意字符串</param>
    /// <returns>半角字符串</returns>
    ///<remarks>
    ///全角空格为12288，半角空格为32
    ///其他字符半角(33-126)与全角(65281-65374)的对应关系是：均相差65248
    ///</remarks>
    private string ToDBC(string input)
    {
        char[] c = input.ToCharArray();
        for (int i = 0; i < c.Length; i++)
        {
            if (c[i] == 12288)
            {
                c[i] = (char)32;
                continue;
            }
            if (c[i] > 65280 && c[i] < 65375)
                c[i] = (char)(c[i] - 65248);
        }
        return new string(c).ToLower();
    }
    #endregion
}

/// <summary>
/// 有限状态自动机DFA算法
/// M=(Q, E, f, q0, Z)
/// Q = 状态集合 即 此处所有的 node 字典  再加一个 空集
/// E = 输入字符集合 即 所有敏感词所包含 单个字符 的集合
/// f = 状态转换函数 即 单个敏感词字符作为Key 在 （所有node+空集）之间的索引关系 与空集索引返回其本身。（即E作为Key在Q中的索引关系）
/// q0 = 唯一初态 即 空集，所有字符索引空集都返回其本身
/// Z = 输出态集合 即 所有敏感词
/// 复杂度为n*logn n为识别文本的长度
/// </summary>
class WordNode
{
    private Dictionary<int, WordNode> node;

    byte isEnd = 0;

    public WordNode()
    {
        node = new Dictionary<int, WordNode>();
    }

    public void Add(string word)
    {
        if (word.Length > 0)
        {
            int key = (int)word[0];
            if (!node.ContainsKey(key))
            {
                node.Add(key, new WordNode());
            }
            WordNode subNode = node[key];
            if(word.Length > 1)
            {
                subNode.Add(word.Substring(1));
            }
            else
            {
                subNode.isEnd = 1;
            }
        }
    }

    public int checkAndGetEndIndex(string sourceDBCText, int cursor, Func<char,bool> checkSpecialSym)
    {
        //检测下位字符如果不是汉字 数字 字符 偏移量加1  
        for (int i = cursor; i < sourceDBCText.Length; i++)
        {
            if (checkSpecialSym != null && checkSpecialSym(sourceDBCText[i]))
            {
                cursor++;
            }
            else break;
        }
        if(cursor >= sourceDBCText.Length) { return -1; }

        int key = sourceDBCText[cursor];
        if (node.ContainsKey(key))
        {
            WordNode group = node[key];
            if(group.isEnd == 1)
            {
                return cursor;
            }
            return node[key].checkAndGetEndIndex(sourceDBCText, cursor + 1,checkSpecialSym);
        }
        return -1;
    }
}

— 于 2019年05月19日，共写了7389个字；

— 文内使用到的标签：敏感词过滤

上一篇: UGUI:一个可以自定义item尺寸的循环滚动框
下一篇: 统一管理小红点方案

木骰

C# 使用DFA实现敏感词过滤算法

DFA的组成：

使用DFA来实现一个敏感词过滤算法

具体实现

发表评论取消回复

DFA的组成：

使用DFA来实现一个敏感词过滤算法

具体实现

发表评论 取消回复

发表评论取消回复