木骰

C# 使用DFA实现敏感词过滤算法

DFA即确定性有限状态自动机,用于实现状态之间的自动转移。 与DFA对应的还有一个NFA非确定有限状态自动机,二者统称为有限自动状态机FSM。它们的主要区别在于 从一个状态转移的时候是否能唯一确定下一个状态。NFA在转移的时候往往不是转移到某一个确定状态,而是某个状态集合,其中的任一状态都可作为下一个状态,而DFA则是确定的。

DFA的组成:

一个非空有限状态集合 Q
一个输入集合 E
状态转移函数 f
初始状态 q0 为Q的一个元素
终止状态集合 Z 为Q的子集

一个DFA可以写成M=(Q, E, f, q0, Z)

使用DFA来实现一个敏感词过滤算法

现假设有NND, CNM, MLGB三个敏感词,则:
Q = {N, NN, NND, C, CN, CNM, M, ML, MLG, MLGB}, 以所有敏感词的组成作为状态集合,状态机只需在这些状态之间转移即可
E = {B, C, D, G, L, N, M}, 以所有组成敏感词的单个字符作为输入集合,状态机只需识别构成敏感词的字符。
qo = null 初始状态为空,为空的初态可以转移到任意状态
Z = {NND, CNM, MLGB} 识别到任意一个敏感词, 状态转移就可以终止了。
那么f 就可以是一个 读入一个字符后查询是否为Q中的状态进而转移的函数,则转移过程为
f(null, N) = N, f(N, N) = NN, f(NN, D) = NND
f(null, C) = C, f(C, N) = CN, f(CN, M) = CNM
f(null, M) = M , f(M, L) = ML, f(ML, G) = MLG, f(MLG, B) = MLGB

具体实现

构建一个WordNode类型,其中定义一个Dictionary类型的字段Directionart<int, WordNode> node字段,所有敏感词都将作为一个WordNode
将所有敏感词 按照拆分首字符为key的形式存储在这个node表中, 将拆分之后的部分作为一个新的WordNode,并继续拆分,直至字符长度为1。最终构建出一棵多叉树结构:

WordNode (N, C, M) ->
->WordNode (N) -> WordNode (N) -> WordNode(D)
->WordNode (C) -> WordNode (N) -> WordNode(M)
->WordNode (M) -> WordNode (L) -> WordNode(G) -> WordNode(B)

如此状态函数f只需按照读入的字符, 查询 所有 WordNode字典 就可以进行状态转移了。

using System;
using System.Collections;
using System.Collections.Generic;
public class FilterHelper
{
private static FilterHelper instance;
public static FilterHelper Instance
{
get
{
if( instance == null )
{
instance = new FilterHelper();
}
return instance;
}
}
private WordNode wordNodes;
public void InitDictionary(string[] words)
{
if (wordNodes == null)
{
wordNodes = new WordNode();
for (int i=0,Imax = words.Length;i<Imax;i++)
{
string word = words[i];
string dbcWord = this.ToDBC(word);
//转繁体字
//wordList.Add(Microsoft.VisualBasic.Strings.StrConv(key, Microsoft.VisualBasic.VbStrConv.TraditionalChinese, 0));
if (dbcWord.Length > 0)
{
wordNodes.Add(word);
}
}
}
}
public struct RNode
{
public int start;
public int len;
public int type;//0为敏感词替换 1为数字序列替换
}
public bool CheckBadWord(string sourceText, int filterNum= 0)
{
if (sourceText != string.Empty)
{
string sourceDBCText = ToDBC(sourceText);
for (int i = 0; i < sourceDBCText.Length; i++)
{
int badWordLen = 0;
if (filterNum > 0 && isNum(sourceDBCText[i]))
{
badWordLen = CheckNumberSeq(sourceDBCText, i, filterNum);
if (badWordLen > 0)
{
return true;
}
}
//查询以该字为首字符的词组
badWordLen = Check(sourceDBCText, i);
if (badWordLen > 0)
{
return true;
}
}
//double timeCost = (System.DateTime.Now - timeStart).TotalMilliseconds;
//Debug.LogError("timeCost:" + timeCost + " ms");
}
return false;
}
public string FilterWithChar(string sourceText, char replaceChar, int filterNum = 0,string numReplace = null)
{
//var timeStart = System.DateTime.Now;
if (sourceText != string.Empty)
{
string sourceDBCText = ToDBC(sourceText);
char[] tempString = sourceText.ToCharArray();
List<RNode> replaceList = new List<RNode>();
for (int i = 0; i < sourceDBCText.Length; i++)
{
int badWordLen = 0;
if (filterNum > 0 && isNum(sourceDBCText[i]))
{
badWordLen = CheckNumberSeq(sourceDBCText, i, filterNum);
if (badWordLen > 0)
{
badWordLen = badWordLen + 1;
if (numReplace == null)
{
for (int pos = 0; pos < badWordLen; pos++)
{
tempString[pos + i] = replaceChar;
}
}
else
{
replaceList.Add(new RNode { start = i, len = badWordLen ,type = 1 });
}
i = i + badWordLen - 1;
continue;
}
}
//查询以该字为首字符的词组
badWordLen = Check(sourceDBCText, i);
if(badWordLen > 0)
{
for (int pos = 0; pos < badWordLen; pos++)
{
tempString[pos + i] = replaceChar;
}
i = i + badWordLen - 1;
}
}
string result;
if(replaceList.Count > 0)
{
result = replaceString(tempString, replaceList, null, numReplace);
}else
{
result = new string(tempString);
}
//double timeCost = (System.DateTime.Now - timeStart).TotalMilliseconds;
//Debug.LogError("timeCost:" + timeCost + " ms");
return result;
}
else
{
return string.Empty;
}
}
public string FilterWithStr(string sourceText, string replaceStr,int filterNum = 0,string numReplace=null)
{
//var timeStart = System.DateTime.Now;
if (sourceText != string.Empty)
{
string sourceDBCText = ToDBC(sourceText);
List<RNode> replaceList = new List<RNode>();
if(filterNum>0 && numReplace == null)
{
numReplace = replaceStr;
}
int badWordLen = 0;
for (int i = 0; i < sourceDBCText.Length; i++)
{
if (filterNum>0 && isNum(sourceDBCText[i]))
{
badWordLen = CheckNumberSeq(sourceDBCText, i, filterNum);
if (badWordLen > 0)
{
badWordLen = badWordLen + 1;
int start = i;
replaceList.Add(new RNode { start = start, len = badWordLen, type = 1 });
i = i + badWordLen - 1;
continue;
}
}
badWordLen = Check(sourceDBCText, i);
if (badWordLen > 0)
{
replaceList.Add(new RNode { start = i, len = badWordLen, type = 0 });
i = i + badWordLen - 1;
}
}
string tempStr = replaceString(sourceText.ToCharArray(), replaceList, replaceStr, numReplace);
//double timeCost = (System.DateTime.Now - timeStart).TotalMilliseconds;
//Debug.LogError("timeCost:" + timeCost + " ms");
return tempStr;
}
else
{
return string.Empty;
}
}
private string replaceString(char[] charArry, List<RNode> nodes, string replaceStr, string numReplace)
{
if(numReplace == null)
{
numReplace = replaceStr;
}
if(replaceStr == null)
{
replaceStr = numReplace;
}
List<char> charList = new List<char>(charArry);
int offset = 0;
for (int i=0,Imax=nodes.Count;i<Imax;i++)
{
int start = nodes[i].start + offset;
int len = nodes[i].len;
int endIndex = start + len - 1;
string str = nodes[i].type == 0 ? replaceStr : numReplace;
if (str.Length < len)
{
charList.RemoveRange(start, len - str.Length);
}
for (int j = 0,Jmax = str.Length ; j < Jmax; j++)
{
char ch = str[j];
int index = start + j;
if(index <= endIndex)
{
charList[index] = ch;
}else
{
charList.Insert(index, ch);
}
}
offset += str.Length - len;
}
return new string(charList.ToArray());
}
private int Check(string sourceText,int cursor)
{
int endsor = wordNodes.checkAndGetEndIndex(sourceText, cursor, checkSpecialSym);
int wordLength = endsor >= cursor ? endsor - cursor + 1 : 0;
return wordLength;
}
private int CheckNumberSeq(string sourceText,int cursor,int filterNum)
{
int count = 0;
int offset = 0;
if (cursor + 1 >= sourceText.Length)
{
return 0;
}
//检测下位字符如果不是汉字 数字 字符 偏移量加1
for (int i = cursor + 1; i < sourceText.Length; i++)
{
//if(checkSpecialSym(sourceText[i]))
//{
// offset++;
//}
//else
if (!isNum(sourceText[i]))
{
break;
}
else
{
count++;
}
}
if(count + 1 >= filterNum)
{
int wordLength = count + offset;
return wordLength;
}else
{
return 0;
}
}
#region helper
private bool checkSpecialSym(char character)
{
return !isCHS(character) && !isNum(character) && !isAlphabet(character);
}
private bool isCHS(char character)
{
// 中文表意字符的范围 4E00-9FA5
int charVal = (int)character;
return (charVal >= 0x4e00 && charVal <= 0x9fa5);
}
private bool isNum(char character)
{
int charVal = (int)character;
return (charVal >= 48 && charVal <= 57);
}
private bool isAlphabet(char character)
{
int charVal = (int)character;
return ((charVal >= 97 && charVal <= 122) || (charVal >= 65 && charVal <= 90));
}
/// <summary>
/// 转半角小写的函数(DBC case)
/// </summary>
/// <param name="input">任意字符串</param>
/// <returns>半角字符串</returns>
///<remarks>
///全角空格为12288,半角空格为32
///其他字符半角(33-126)与全角(65281-65374)的对应关系是:均相差65248
///</remarks>
private string ToDBC(string input)
{
char[] c = input.ToCharArray();
for (int i = 0; i < c.Length; i++)
{
if (c[i] == 12288)
{
c[i] = (char)32;
continue;
}
if (c[i] > 65280 && c[i] < 65375)
c[i] = (char)(c[i] - 65248);
}
return new string(c).ToLower();
}
#endregion
}
/// <summary>
/// 有限状态自动机DFA算法
/// M=(Q, E, f, q0, Z)
/// Q = 状态集合 即 此处所有的 node 字典 再加一个 空集
/// E = 输入字符集合 即 所有敏感词所包含 单个字符 的集合
/// f = 状态转换函数 即 单个敏感词字符作为Key 在 (所有node+空集)之间的索引关系 与空集索引返回其本身。(即E作为Key在Q中的索引关系)
/// q0 = 唯一初态 即 空集,所有字符索引空集都返回其本身
/// Z = 输出态集合 即 所有敏感词
/// 复杂度为n*logn n为识别文本的长度
/// </summary>
class WordNode
{
private Dictionary<int, WordNode> node;
byte isEnd = 0;
public WordNode()
{
node = new Dictionary<int, WordNode>();
}
public void Add(string word)
{
if (word.Length > 0)
{
int key = (int)word[0];
if (!node.ContainsKey(key))
{
node.Add(key, new WordNode());
}
WordNode subNode = node[key];
if(word.Length > 1)
{
subNode.Add(word.Substring(1));
}
else
{
subNode.isEnd = 1;
}
}
}
public int checkAndGetEndIndex(string sourceDBCText, int cursor, Func<char,bool> checkSpecialSym)
{
//检测下位字符如果不是汉字 数字 字符 偏移量加1
for (int i = cursor; i < sourceDBCText.Length; i++)
{
if (checkSpecialSym != null && checkSpecialSym(sourceDBCText[i]))
{
cursor++;
}
else break;
}
if(cursor >= sourceDBCText.Length) { return -1; }
int key = sourceDBCText[cursor];
if (node.ContainsKey(key))
{
WordNode group = node[key];
if(group.isEnd == 1)
{
return cursor;
}
return node[key].checkAndGetEndIndex(sourceDBCText, cursor + 1,checkSpecialSym);
}
return -1;
}
}
— 于 共写了7389个字
— 文内使用到的标签:

发表评论

电子邮件地址不会被公开。 必填项已用*标注

*