You've already forked linux-packaging-mono
							
							
		
			
	
	
		
			112 lines
		
	
	
		
			3.2 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
		
		
			
		
	
	
			112 lines
		
	
	
		
			3.2 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
|   | //===-- TrigramIndex.cpp - a heuristic for SpecialCaseList ----------------===//
 | ||
|  | //
 | ||
|  | //                     The LLVM Compiler Infrastructure
 | ||
|  | //
 | ||
|  | // This file is distributed under the University of Illinois Open Source
 | ||
|  | // License. See LICENSE.TXT for details.
 | ||
|  | //
 | ||
|  | //===----------------------------------------------------------------------===//
 | ||
|  | //
 | ||
|  | // TrigramIndex implements a heuristic for SpecialCaseList that allows to
 | ||
|  | // filter out ~99% incoming queries when all regular expressions in the
 | ||
|  | // SpecialCaseList are simple wildcards with '*' and '.'. If rules are more
 | ||
|  | // complicated, the check is defeated and it will always pass the queries to a
 | ||
|  | // full regex.
 | ||
|  | //
 | ||
|  | //===----------------------------------------------------------------------===//
 | ||
|  | 
 | ||
|  | #include "llvm/Support/TrigramIndex.h"
 | ||
|  | #include "llvm/ADT/SmallVector.h"
 | ||
|  | 
 | ||
|  | #include <set>
 | ||
|  | #include <string>
 | ||
|  | #include <unordered_map>
 | ||
|  | 
 | ||
|  | using namespace llvm; | ||
|  | 
 | ||
|  | static const char RegexAdvancedMetachars[] = "()^$|+?[]\\{}"; | ||
|  | 
 | ||
|  | static bool isAdvancedMetachar(unsigned Char) { | ||
|  |   return strchr(RegexAdvancedMetachars, Char) != nullptr; | ||
|  | } | ||
|  | 
 | ||
|  | void TrigramIndex::insert(std::string Regex) { | ||
|  |   if (Defeated) return; | ||
|  |   std::set<unsigned> Was; | ||
|  |   unsigned Cnt = 0; | ||
|  |   unsigned Tri = 0; | ||
|  |   unsigned Len = 0; | ||
|  |   bool Escaped = false; | ||
|  |   for (unsigned Char : Regex) { | ||
|  |     if (!Escaped) { | ||
|  |       // Regular expressions allow escaping symbols by preceding it with '\'.
 | ||
|  |       if (Char == '\\') { | ||
|  |         Escaped = true; | ||
|  |         continue; | ||
|  |       } | ||
|  |       if (isAdvancedMetachar(Char)) { | ||
|  |         // This is a more complicated regex than we can handle here.
 | ||
|  |         Defeated = true; | ||
|  |         return; | ||
|  |       } | ||
|  |       if (Char == '.' || Char == '*') { | ||
|  |         Tri = 0; | ||
|  |         Len = 0; | ||
|  |         continue; | ||
|  |       } | ||
|  |     } | ||
|  |     if (Escaped && Char >= '1' && Char <= '9') { | ||
|  |       Defeated = true; | ||
|  |       return; | ||
|  |     } | ||
|  |     // We have already handled escaping and can reset the flag.
 | ||
|  |     Escaped = false; | ||
|  |     Tri = ((Tri << 8) + Char) & 0xFFFFFF; | ||
|  |     Len++; | ||
|  |     if (Len < 3) | ||
|  |       continue; | ||
|  |     // We don't want the index to grow too much for the popular trigrams,
 | ||
|  |     // as they are weak signals. It's ok to still require them for the
 | ||
|  |     // rules we have already processed. It's just a small additional
 | ||
|  |     // computational cost.
 | ||
|  |     if (Index[Tri].size() >= 4) | ||
|  |       continue; | ||
|  |     Cnt++; | ||
|  |     if (!Was.count(Tri)) { | ||
|  |       // Adding the current rule to the index.
 | ||
|  |       Index[Tri].push_back(Counts.size()); | ||
|  |       Was.insert(Tri); | ||
|  |     } | ||
|  |   } | ||
|  |   if (!Cnt) { | ||
|  |     // This rule does not have remarkable trigrams to rely on.
 | ||
|  |     // We have to always call the full regex chain.
 | ||
|  |     Defeated = true; | ||
|  |     return; | ||
|  |   } | ||
|  |   Counts.push_back(Cnt); | ||
|  | } | ||
|  | 
 | ||
|  | bool TrigramIndex::isDefinitelyOut(StringRef Query) const { | ||
|  |   if (Defeated) | ||
|  |     return false; | ||
|  |   std::vector<unsigned> CurCounts(Counts.size()); | ||
|  |   unsigned Tri = 0; | ||
|  |   for (size_t I = 0; I < Query.size(); I++) { | ||
|  |     Tri = ((Tri << 8) + Query[I]) & 0xFFFFFF; | ||
|  |     if (I < 2) | ||
|  |       continue; | ||
|  |     const auto &II = Index.find(Tri); | ||
|  |     if (II == Index.end()) | ||
|  |       continue; | ||
|  |     for (size_t J : II->second) { | ||
|  |       CurCounts[J]++; | ||
|  |       // If we have reached a desired limit, we have to look at the query
 | ||
|  |       // more closely by running a full regex.
 | ||
|  |       if (CurCounts[J] >= Counts[J]) | ||
|  |         return false; | ||
|  |     } | ||
|  |   } | ||
|  |   return true; | ||
|  | } |