Source code for dlk.utils.quick_search

# Copyright 2021 cstsunfu. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import ahocorasick
from typing import Iterable, List, Dict

[docs]class QuickSearch(object): """Ahocorasick enhanced Trie""" def __init__(self, words: Iterable=[]): """init the QuickSearch Args: words: tokens in use words to update the trie Returns: None """ super(QuickSearch, self).__init__() if not isinstance(words, Iterable) : raise PermissionError("The words must be a Iterable object.") self.next_index = 0 self.ac = ahocorasick.Automaton() for sub_str in words: sub_str = f" {sub_str.strip()} " self.ac.add_word(sub_str, (self.next_index, sub_str)) self.next_index += 1 self.ac.make_automaton()
[docs] def search(self, search_str:str)->List[Dict]: """find whether some sub_str in trie Args: search_str: find the search_str Returns: >>> list of result: >>> the result organized as { >>> "start": start_position, >>> "end": end_position, >>> "str": search_str[start_position: end_position] >>> } """ result = [] for end_index, (_, original_value) in self.ac.iter(search_str): start_index = end_index - len(original_value) + 1 end_index = end_index + 1 result.append({"start": start_index, "end": end_index, "str": original_value}) return result
[docs] def has(self, key: str)->bool: """check key is in trie Args: key: a token(str) Returns: bool(has or not) """ try: self.ac.get(key) except: return False return True
[docs] def add_word(self, word: str): """add a single word to trie Args: word: single token Returns: None """ self.ac.add_word(word, (self.next_index, word)) self.next_index += 1 self.ac.make_automaton()
[docs] def add_words(self, words: Iterable): """add words from iterator to the trie Args: words: Iterable[tokens] Returns: None """ for sub_str in words: self.ac.add_word(sub_str, (self.next_index, sub_str)) self.next_index += 1 self.ac.make_automaton()