# Source code for dlk.utils.vocab

# Copyright 2021 cstsunfu. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import Counter
from typing import Dict, Iterable, List, Union
import pandas as pd


class Vocabulary(object):
    """Bidirectional token <-> index mapping built incrementally from tokens.

    Tokens are added one at a time (``add``) or in bulk (``add_from_iter`` /
    ``auto_update``).  The object can be exported to a plain dict with
    ``dumps`` and rebuilt with ``load``.

    Special tokens, when given as non-empty strings, get reserved indices:
    ``ignore`` -> -1, ``pad`` -> 0, and ``unknown`` -> the next free index
    (1 when ``pad`` is set, otherwise 0).
    """

    def __init__(self, do_strip: bool=False, unknown: str='', ignore: str="", pad: str=''):
        """Create a vocabulary holding only the requested special tokens.

        Args:
            do_strip: strip surrounding whitespace from words in ``add`` and
                ``get_index``
            unknown: token whose index is returned for words not in the
                vocab; an empty string disables the fallback
            ignore: token mapped to index -1; an empty string disables it
            pad: token mapped to index 0; an empty string disables it

        """
        self.word2idx = {}
        self.idx2word = {}
        self.do_strip = do_strip
        self.word_num = 0  # next index handed out by `add`
        self.word_count = Counter() # reserved
        self.unknown = unknown
        self.ignore = ignore
        self.pad = pad
        # Special tokens get a huge count so that frequency based filtering
        # always ranks them first.
        if ignore:
            # `ignore` lives at -1 and does not consume a regular index.
            self.word2idx[ignore] = -1
            self.idx2word[-1] = ignore
            self.word_count[ignore] += int(1e10)
        if pad:
            assert self.word_num == 0, "The pad id must be 0"
            self.word_count[pad] += int(1e10)
            self.word2idx[pad] = self.word_num
            self.idx2word[self.word_num] = pad
            self.word_num += 1
        if unknown:
            self.word_count[unknown] += int(1e10)
            self.word2idx[unknown] = self.word_num
            self.idx2word[self.word_num] = unknown
            self.word_num += 1

    def dumps(self)->Dict:
        """dumps the object to dict

        Returns: 
            self.__dict__ (the live attribute dict, not a copy)

        """
        return self.__dict__

    @classmethod
    def load(cls, attr: Dict):
        """load the object from dict

        The given dict is copied into a fresh instance instead of being
        adopted as the instance ``__dict__``.  ``idx2word`` keys are coerced
        back to ``int`` and ``word_count`` is rebuilt as a ``Counter``, so a
        dict that went through a JSON round trip (which turns integer keys
        into strings and the Counter into a plain dict) loads correctly.

        Args:
            attr: a dict as produced by ``dumps``

        Returns: 
            initialized Vocabulary

        """
        vocab = cls()
        vocab.__dict__.update(attr)
        vocab.word2idx = dict(vocab.word2idx)
        vocab.idx2word = {int(index): word for index, word in vocab.idx2word.items()}
        # `add` relies on Counter returning 0 for unseen words.
        vocab.word_count = Counter(vocab.word_count)
        return vocab

    def __getitem__(self, index: int):
        """get the token by index

        Args:
            index: token index

        Returns: 
            `word` whose index is given, if index is not out of range

        Raises:
            KeyError

        """
        return self.idx2word[int(index)]

    def auto_get_index(self, data: Union[str, List]):
        """get the index of every word in data from this vocab

        Args:
            data: a single word or an (arbitrarily nested) list of words

        Returns: 
            an index for a str input, a list with the same nesting for a list

        Raises:
            ValueError: data (or a nested element) is neither str nor list

        """
        if isinstance(data, str):
            return self.get_index(data)
        elif isinstance(data, list):
            return [self.auto_get_index(subdata) for subdata in data]
        else:
            raise ValueError("Don't support the type of {}".format(data))

    def get_index(self, word: str)->int:
        """get the index of word from this vocab

        Args:
            word: a single token

        Returns: 
            index of the word, or the index of the unknown token when the
            word is missing and an unknown token is configured

        Raises:
            KeyError: the word is missing and no unknown token is configured

        """
        if self.do_strip:
            word = word.strip()
        try:
            return self.word2idx[word]
        except (KeyError, TypeError):
            # TypeError covers unhashable input, which the previous bare
            # `except` also routed to the fallback below.
            if self.unknown:
                return self.word2idx[self.unknown]
            else:
                raise KeyError('Unkown word: {}'.format(word))

    def filter_rare(self, min_freq=1, most_common=-1):
        """drop rare words and rebuild the index mappings.

        Only one of ``min_freq`` and ``most_common`` may differ from its
        default.  Configured special tokens are always kept at their
        reserved indices (ignore=-1, pad=0, unknown right after pad); the
        remaining words are numbered contiguously after them.  ``word_num``
        is updated so later ``add`` calls continue from the next free index.
        ``word_count`` is left untouched.

        Args:
            min_freq: minimum frequency a word needs to be kept
            most_common: keep only this many highest-count entries of
                ``word_count``, -1 means all

        Returns: 
            self

        """
        # Validate before touching any state so a bad call leaves the vocab intact.
        assert min_freq == 1 or most_common == -1, "You should set the min_freq=1 or most_common=-1."
        if most_common != -1:
            kept = [token for token, _ in self.word_count.most_common(most_common)]
        else:
            kept = [token for token, freq in self.word_count.items() if freq >= min_freq]

        word2idx = {}
        idx2word = {}
        if self.ignore:
            word2idx[self.ignore] = -1
            idx2word[-1] = self.ignore
        next_index = 0
        # Re-establish the reserved slots in the same order as __init__.
        for token in (self.pad, self.unknown):
            if token and token not in word2idx:
                word2idx[token] = next_index
                idx2word[next_index] = token
                next_index += 1
        for token in kept:
            if token in word2idx:
                continue  # special token, already placed
            word2idx[token] = next_index
            idx2word[next_index] = token
            next_index += 1

        self.word2idx = word2idx
        self.idx2word = idx2word
        self.word_num = next_index
        return self

    def get_word(self, index: int)->str:
        """get the word by index

        Args:
            index: word index

        Returns: 
            word, or '[unknown]' when index is -1 and no token is stored at -1

        Raises:
            KeyError: the index is not defined (or cannot be converted to int)

        """
        try:
            return self.idx2word[int(index)]
        except (KeyError, TypeError, ValueError):
            if index == -1:
                return '[unknown]'
            raise KeyError('Undefined index: {}'.format(index))

    def add(self, word):
        """add one word to vocab

        A word seen for the first time gets the next free index; every call
        increments the word's count.  When ``do_strip`` is set the word is
        stripped first, so it is stored under the same key that
        ``get_index`` looks up.

        Args:
            word: single word

        Returns: 
            self

        """
        if self.do_strip and isinstance(word, str):
            word = word.strip()
        # Counter returns 0 for unseen keys without inserting them.
        if not self.word_count[word]:
            self.word2idx[word] = self.word_num
            self.idx2word[self.word_num] = word
            self.word_num += 1
        self.word_count[word] += 1
        return self

    def auto_update(self, data: Union[str, Iterable]):
        """auto detect data type to update the vocab

        Args:
            data:  str | List[str] | Set[str] | pd.Series | List[List[str]]

        Returns: 
            self

        Raises:
            ValueError: data is not a str, list, set or pd.Series

        """
        if isinstance(data, str):
            self.add(data)
        elif isinstance(data, (list, set, pd.Series)):
            self.add_from_iter(data)
        else:
            raise ValueError("Don't support the type of {}".format(data))
        return self

    def __len__(self):
        """get the token num of vocab

        Returns: 
            len(self.word2idx)

        """
        return len(self.word2idx)

    def add_from_iter(self, iterator):
        """add the tokens in iterator to vocab

        Nested lists and sets are walked recursively.

        Args:
            iterator: List[str] | Set[str] | List[List[str]]

        Returns: 
            self

        Raises:
            ValueError: an element is not a str, list or set

        """
        for word in iterator:
            if isinstance(word, (list, set)):
                self.add_from_iter(word)
            elif isinstance(word, str):
                self.add(word)
            else:
                raise ValueError("Don't support the type of {}".format(word))
        return self