即日起在codingBlog上分享您的技术经验即可获得积分,积分可兑换现金哦。

C++ 敏感词屏蔽

编程语言 nightwizard2030 50℃ 0评论

以前做ASP.NET的时候接触过敏感词屏蔽,在c#那种强大的框架下,直接用切词工具再加一个敏感词字典就搞定一切。回到c++,我肯定不会因为搞个敏感词就去引入个框架,第一追求效率,第二看别人写的c++代码也挺痛苦的。接下来,就讲解下对于敏感词屏蔽的具体思路与代码。


首先要解决的问题是敏感词的存储形式,这就涉及数据结构。先想想搜索屏蔽要怎么处理:比如我有一个content,我就遍历它的每个字符,先看词典中所有词里第一个字符与之相同的,再看第二个,再看第三个,依此类推。那么,很明显,这就需要一种按层存储的数据结构——树——来存储敏感词汇。我首先设计了一个Node,它要存储四样东西:同一级的node指针、下一级的node指针、标识词结束的标志,以及字符数据。最开始本来只想到用树的结构,最后发现,这居然就是个二叉树,可以抽象成左边是next,右边就是sibling,那问题就简单了。


我的代码实现非常简单明了,而且中文那些都完全没问题,不像其他人弄的那么复杂,而且随便测试,毫无BUG


#include <cstring>
#include <fstream>
#include <iostream>
#include <list>
#include <sstream>
#include <string>
using namespace std;

// One character of the keyword tree. The tree is stored in a
// left-child / right-sibling layout: `next` leads one level down,
// `sibling` walks across the alternatives on the same level.
struct Node{
 char data;//character (byte) held by this node
 bool last;//set to true on the node where a keyword ends
 Node* sibling;//next node on the same level, 0 at the end of the level
 Node* next;//first node of the level below, 0 when there is none
};

// Loads the keyword dictionary from "keyword.txt" (current directory).
//
// The file is one run of text in which keywords are separated by '@'.
// Empty fields (e.g. "a@@b" or a leading/trailing '@') are ignored, and a
// keyword that is already present in list_keyword is not added again.
//
// list_keyword: receives one heap-allocated, NUL-terminated copy per new
//               keyword, appended in file order. Each entry is allocated
//               with new[] and must be released with delete[]
//               (ReleaseKeyWordList does this).
//
// A missing or empty file leaves list_keyword unchanged.
void GetKeyWordList(std::list<char*>& list_keyword){
 std::ifstream in("keyword.txt");
 std::stringstream ss;
 ss<<in.rdbuf();//read keyword all text; an unopened file yields no text
 const std::string text=ss.str();

 std::string::size_type begin=0;
 while(begin<text.size()){
  std::string::size_type end=text.find('@',begin);//delim is @
  if(end==std::string::npos){
   end=text.size();//last keyword has no trailing delimiter
  }

  if(end>begin){//skip empty fields between adjacent delimiters
   const std::string word=text.substr(begin,end-begin);

   bool exist=false;
   for(std::list<char*>::iterator it_keyword=list_keyword.begin();it_keyword!=list_keyword.end();++it_keyword){
    if(std::strcmp(*it_keyword,word.c_str())==0){
     exist=true;
     break;
    }
   }

   if(!exist){//push keyword to list except the repeated keyword
    char* copy=new char[word.size()+1];
    std::memcpy(copy,word.c_str(),word.size()+1);//+1 copies the terminating NUL
    list_keyword.push_back(copy);
   }
  }

  begin=end+1;
 }
}

// Frees every keyword string held by list_keyword and empties the list.
//
// The entries are expected to have been allocated with new char[...]
// (as GetKeyWordList does), so they are released with delete[]; using
// plain delete on array allocations is undefined behaviour.
// The list is cleared afterwards so it never holds dangling pointers and
// calling this function twice is harmless.
void ReleaseKeyWordList(std::list<char*>& list_keyword){
 for(std::list<char*>::iterator it_keyword=list_keyword.begin();it_keyword!=list_keyword.end();++it_keyword){
  delete[] *it_keyword;
 }
 list_keyword.clear();
}

// Searches one level of the tree for a given character.
//
// cur:  first node of the level to scan (may be 0 for an empty level).
// data: character to look for.
//
// Returns the first node on the sibling chain starting at cur whose
// data equals `data`, or 0 when no node on that chain matches.
Node* NodeHasExist(Node* cur,char data){
 for(Node* probe=cur;probe!=0;probe=probe->sibling){
  if(probe->data!=data){
   continue;
  }
  return probe;
 }
 return 0;
}


// Builds the keyword tree from the dictionary returned by GetKeyWordList.
//
// The returned root is a placeholder node (data 0) that carries no
// character; root->next is the first level of the tree. Every keyword is
// inserted byte by byte: each level is a sibling chain, and a new
// character is appended at the end of its level's chain. The node holding
// the final byte of a keyword gets last=true, so a keyword that is a
// prefix of another keyword is still recognised.
//
// Returns the heap-allocated root; the caller releases the whole tree
// with ReleaseTree.
Node* BuildTree(){
 std::list<char*> list_keyword;

 GetKeyWordList(list_keyword);

 Node* root=new Node();//make keyword tree by list
 root->data=0;
 root->last=false;
 root->sibling=0;
 root->next=0;

 for(std::list<char*>::iterator it_keyword=list_keyword.begin();it_keyword!=list_keyword.end();++it_keyword){//traversal list
  const char* word=*it_keyword;
  const std::size_t length=std::strlen(word);
  if(length==0){
   continue;//an empty keyword must not mark the root itself as a word end
  }

  Node* cur=root;//every keyword starts again from the root
  for(std::size_t i=0;i<length;++i){
   Node* node=NodeHasExist(cur->next,word[i]);//check char node does exist in cur->next link list

   if(node==0){
    node=new Node();
    node->data=word[i];
    node->last=false;
    node->next=0;
    node->sibling=0;

    //walk to the empty link at the end of cur's child chain and attach there
    Node** link=&cur->next;
    while(*link!=0){
     link=&(*link)->sibling;
    }
    *link=node;
   }

   //if node exist,skip create node again

   cur=node;//descend one level
  }
  cur->last=true;//mark the end of the word; the node may still have further branches
 }

 ReleaseKeyWordList(list_keyword);

 return root;
}

// Deletes the node `root` together with everything reachable from it
// through `next` and `sibling` (post-order: both sub-chains first, then
// the node itself). Passing 0 is allowed and does nothing.
void ReleaseTree(Node* root){
 if(root==0){
  return;
 }

 Node* below=root->next;
 Node* beside=root->sibling;

 ReleaseTree(below);
 ReleaseTree(beside);
 delete root;
}
                    
void LookUp(char* content,Node* root){
 Node* cur=root->next;
 int start=-1;
 stringstream ss;
 bool shouldReview=false;
 for(int i=0;idata==ch){
    hasfind=true;
    if(start==-1){//first time find char ,record first time index
     start=i;
    }
    if(cur->last&&cur->next==0){//it has arrived at last branch
     start=-1;
     ss<<"*";
     cur=root->next;
     shouldReview=false; 
    }
    else if(cur->last){//it may not arrived at last branch
     start=i+1;
     ss<<"*";
     cur=cur->next;
     shouldReview=true;
    }
    else{
     cur=cur->next;
    }
    break;
   }
   cur=cur->sibling;
  }
  
  if(shouldReview&&!hasfind){
   i=start-1;//this time is checking cur->next,it should also check root->next,start-1 because of i++
   shouldReview=false;
   start=-1;
   cur=root->next;
  }
  else if(!hasfind){
   if(start>-1){
    i=start;//start must have checked with root->next,so it was not used to be -1
   }
   ss<next;
  }
 }
 
 if(start>-1&&!shouldReview){
  for(int j=start;j








敏感词库:




测试结果:





转载请注明:CodingBlog » C++ 敏感词屏蔽

喜欢 (0)or分享 (0)
发表我的评论
取消评论

*

表情