Source code for native_client.ctcdecode

import enum
from collections import namedtuple

from . import swigwrapper  # pylint: disable=import-self

# This module is built with SWIG_PYTHON_STRICT_BYTE_CHAR so we must handle
# string encoding explicitly, here and throughout this file.
__version__ = swigwrapper.__version__.decode("utf-8")

# Hack: import error codes by matching on their names, as SWIG unfortunately
# does not support binding enums to Python in a scoped manner yet.
for symbol in dir(swigwrapper):
    if symbol.startswith("STT_ERR_"):
        globals()[symbol] = getattr(swigwrapper, symbol)


class Alphabet(swigwrapper.Alphabet):
    """An Alphabet is a bidirectional map from tokens (eg. characters) to
    internal integer representations used by the underlying acoustic models
    and external scorers. It can be created from an alphabet configuration
    file via the constructor, or from a list of tokens via
    :py:meth:`Alphabet.InitFromLabels`.
    """

    def __init__(self, config_path=None):
        super(Alphabet, self).__init__()
        if config_path:
            err = self.init(config_path.encode("utf-8"))
            if err != 0:
                raise ValueError(
                    "Alphabet initialization failed with error code 0x{:X}".format(err)
                )

    def InitFromLabels(self, data):
        """
        Initialize Alphabet from a list of labels ``data``. Each label gets
        associated with an integer value corresponding to its position in the
        list.
        """
        return super(Alphabet, self).InitFromLabels(
            [c.encode("utf-8") for c in data]
        )

    def CanEncodeSingle(self, input):
        """
        Returns true if the single character/output class has a corresponding
        label in the alphabet.
        """
        return super(Alphabet, self).CanEncodeSingle(input.encode("utf-8"))

    def CanEncode(self, input):
        """
        Returns true if the entire string can be encoded into labels in this
        alphabet.
        """
        return super(Alphabet, self).CanEncode(input.encode("utf-8"))

    def EncodeSingle(self, input):
        """
        Encode a single character/output class into a label. Character must be
        in the alphabet, this method will assert that. Use ``CanEncodeSingle``
        to test.
        """
        return super(Alphabet, self).EncodeSingle(input.encode("utf-8"))

    def Encode(self, input):
        """
        Encode a sequence of character/output classes into a sequence of
        labels. Characters are assumed to always take a single Unicode
        codepoint. Characters must be in the alphabet, this method will assert
        that. Use ``CanEncode`` and ``CanEncodeSingle`` to test.
        """
        # Convert SWIG's UnsignedIntVec to a Python list
        res = super(Alphabet, self).Encode(input.encode("utf-8"))
        return [el for el in res]

    def DecodeSingle(self, input):
        """Decode a single label into its corresponding string token."""
        res = super(Alphabet, self).DecodeSingle(input)
        return res.decode("utf-8")

    def Decode(self, input):
        """Decode a sequence of labels into a string."""
        res = super(Alphabet, self).Decode(input)
        return res.decode("utf-8")

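# Illustrative usage sketch (not part of the module): round-tripping text
# through an Alphabet. The file name "alphabet.txt" is a hypothetical
# placeholder for an alphabet configuration file with one token per line.
#
#   alphabet = Alphabet("alphabet.txt")
#   if alphabet.CanEncode("hello"):
#       labels = alphabet.Encode("hello")   # list of integer labels
#       text = alphabet.Decode(labels)      # back to "hello"
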
class Scorer(swigwrapper.Scorer):
    """An external scorer is a data structure composed of a language model
    built from text data, as well as the vocabulary used in the construction
    of this language model and additional parameters related to how the
    decoding process uses the external scorer, such as the language model
    weight ``alpha`` and the word insertion score ``beta``.

    :param alpha: Language model weight.
    :type alpha: float
    :param beta: Word insertion score.
    :type beta: float
    :param scorer_path: Path to load scorer from.
    :type scorer_path: str
    :param alphabet: Alphabet object matching the tokens used when creating
                     the external scorer.
    :type alphabet: Alphabet
    """

    def __init__(self, alpha=None, beta=None, scorer_path=None, alphabet=None):
        super(Scorer, self).__init__()
        # Allow bare initialization
        if alphabet:
            assert alpha is not None, "alpha parameter is required"
            assert beta is not None, "beta parameter is required"
            assert scorer_path, "scorer_path parameter is required"

            err = self.init_from_filepath(scorer_path.encode("utf-8"), alphabet)
            if err != 0:
                raise ValueError(
                    "Scorer initialization failed with error code 0x{:X}".format(err)
                )

            self.reset_params(alpha, beta)

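# Illustrative usage sketch (not part of the module): constructing an external
# scorer. The path "kenlm.scorer" and the alpha/beta values are hypothetical
# placeholders; in practice alpha and beta are tuned on a validation set.
#
#   alphabet = Alphabet("alphabet.txt")
#   scorer = Scorer(alpha=0.93, beta=1.18,
#                   scorer_path="kenlm.scorer", alphabet=alphabet)
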
DecodeResult = namedtuple(
    "DecodeResult", ["confidence", "transcript", "tokens", "timesteps"]
)

def ctc_beam_search_decoder(
    probs_seq,
    alphabet,
    beam_size,
    cutoff_prob=1.0,
    cutoff_top_n=40,
    scorer=None,
    hot_words=dict(),
    num_results=1,
):
    """Wrapper for the CTC Beam Search Decoder.

    :param probs_seq: 2-D list of probability distributions over each time
                      step, with each element being a list of normalized
                      probabilities over alphabet and blank.
    :type probs_seq: 2-D list
    :param alphabet: Alphabet object matching the acoustic model output labels.
    :type alphabet: Alphabet
    :param beam_size: Width for beam search.
    :type beam_size: int
    :param cutoff_prob: Cutoff probability in pruning, default 1.0, no pruning.
    :type cutoff_prob: float
    :param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n
                         characters with highest probs in alphabet will be
                         used in beam search, default 40.
    :type cutoff_top_n: int
    :param scorer: External scorer for partially decoded sentence, e.g. word
                   count or language model.
    :type scorer: Scorer
    :param hot_words: Map of words (keys) to their assigned boosts (values)
    :type hot_words: dict[string, float]
    :param num_results: Number of beams to return.
    :type num_results: int
    :return: List of DecodeResult namedtuples (confidence, transcript, tokens,
             timesteps), in descending order of confidence.
    :rtype: list
    """
    beam_results = swigwrapper.ctc_beam_search_decoder(
        probs_seq,
        alphabet,
        beam_size,
        cutoff_prob,
        cutoff_top_n,
        scorer,
        hot_words,
        num_results,
    )
    beam_results = [
        DecodeResult(
            res.confidence,
            alphabet.Decode(res.tokens),
            [int(t) for t in res.tokens],
            [int(t) for t in res.timesteps],
        )
        for res in beam_results
    ]
    return beam_results

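# Illustrative usage sketch (not part of the module): decoding a single
# utterance. "probs" is a hypothetical placeholder for a (time steps x
# alphabet size + blank) matrix of softmaxed acoustic model outputs; "alphabet"
# and "scorer" are assumed to have been created as in the sketches above.
#
#   results = ctc_beam_search_decoder(probs, alphabet, beam_size=100,
#                                     scorer=scorer, num_results=5)
#   best = results[0]
#   print(best.confidence, best.transcript, best.tokens, best.timesteps)
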
def ctc_beam_search_decoder_for_wav2vec2am(
    probs_seq,
    alphabet,
    beam_size,
    cutoff_prob=1.0,
    cutoff_top_n=40,
    blank_id=-1,
    ignored_symbols=frozenset(),
    scorer=None,
    hot_words=dict(),
    num_results=1,
):
    """Wrapper for the CTC Beam Search Decoder for wav2vec2 acoustic models.

    :param probs_seq: 2-D list of probability distributions over each time
                      step, with each element being a list of normalized
                      probabilities over alphabet and blank.
    :type probs_seq: 2-D list
    :param alphabet: Alphabet object matching the acoustic model output labels.
    :type alphabet: Alphabet
    :param beam_size: Width for beam search.
    :type beam_size: int
    :param cutoff_prob: Cutoff probability in pruning, default 1.0, no pruning.
    :type cutoff_prob: float
    :param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n
                         characters with highest probs in alphabet will be
                         used in beam search, default 40.
    :type cutoff_top_n: int
    :param blank_id: Index of the CTC blank symbol in the acoustic model
                     outputs.
    :type blank_id: int
    :param ignored_symbols: Indices of output symbols to ignore during
                            decoding.
    :type ignored_symbols: frozenset[int]
    :param scorer: External scorer for partially decoded sentence, e.g. word
                   count or language model.
    :type scorer: Scorer
    :param hot_words: Map of words (keys) to their assigned boosts (values)
    :type hot_words: dict[string, float]
    :param num_results: Number of beams to return.
    :type num_results: int
    :return: List of DecodeResult namedtuples (confidence, transcript, tokens,
             timesteps), in descending order of confidence.
    :rtype: list
    """
    beam_results = swigwrapper.ctc_beam_search_decoder_for_wav2vec2am(
        probs_seq,
        alphabet,
        beam_size,
        cutoff_prob,
        cutoff_top_n,
        blank_id,
        ignored_symbols,
        scorer,
        hot_words,
        num_results,
    )
    beam_results = [
        DecodeResult(
            res.confidence,
            alphabet.Decode(res.tokens),
            [int(t) for t in res.tokens],
            [int(t) for t in res.timesteps],
        )
        for res in beam_results
    ]
    return beam_results

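# Illustrative sketch (not part of the module): the wav2vec2 variant adds
# ``blank_id`` and ``ignored_symbols``. The indices used here are hypothetical
# placeholders for the blank token and any auxiliary/padding outputs that
# should be skipped during decoding.
#
#   results = ctc_beam_search_decoder_for_wav2vec2am(
#       probs, alphabet, beam_size=100, blank_id=0,
#       ignored_symbols=frozenset([1, 2]), scorer=scorer)
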
def ctc_beam_search_decoder_batch(
    probs_seq,
    seq_lengths,
    alphabet,
    beam_size,
    num_processes,
    cutoff_prob=1.0,
    cutoff_top_n=40,
    scorer=None,
    hot_words=dict(),
    num_results=1,
):
    """Wrapper for the batched CTC beam search decoder.

    :param probs_seq: 3-D list with each element as an instance of 2-D list of
                      probabilities used by ctc_beam_search_decoder().
    :type probs_seq: 3-D list
    :param seq_lengths: Number of valid time steps in each sample of the batch.
    :type seq_lengths: list[int]
    :param alphabet: Alphabet object matching the acoustic model output labels.
    :type alphabet: Alphabet
    :param beam_size: Width for beam search.
    :type beam_size: int
    :param num_processes: Number of parallel processes.
    :type num_processes: int
    :param cutoff_prob: Cutoff probability in alphabet pruning, default 1.0,
                        no pruning.
    :type cutoff_prob: float
    :param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n
                         characters with highest probs in alphabet will be
                         used in beam search, default 40.
    :type cutoff_top_n: int
    :param scorer: External scorer for partially decoded sentence, e.g. word
                   count or language model.
    :type scorer: Scorer
    :param hot_words: Map of words (keys) to their assigned boosts (values)
    :type hot_words: dict[string, float]
    :param num_results: Number of beams to return.
    :type num_results: int
    :return: List (one entry per sample) of lists of DecodeResult namedtuples,
             in descending order of confidence.
    :rtype: list
    """
    batch_beam_results = swigwrapper.ctc_beam_search_decoder_batch(
        probs_seq,
        seq_lengths,
        alphabet,
        beam_size,
        num_processes,
        cutoff_prob,
        cutoff_top_n,
        scorer,
        hot_words,
        num_results,
    )
    batch_beam_results = [
        [
            DecodeResult(
                res.confidence,
                alphabet.Decode(res.tokens),
                [int(t) for t in res.tokens],
                [int(t) for t in res.timesteps],
            )
            for res in beam_results
        ]
        for beam_results in batch_beam_results
    ]
    return batch_beam_results

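# Illustrative sketch (not part of the module): batched decoding. Here
# "probs_batch" and "lengths" are hypothetical placeholders for a 3-D array of
# per-sample probabilities and the number of valid time steps per sample.
#
#   batch_results = ctc_beam_search_decoder_batch(
#       probs_batch, lengths, alphabet, beam_size=100,
#       num_processes=4, scorer=scorer)
#   for sample_results in batch_results:
#       print(sample_results[0].transcript)
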
def ctc_beam_search_decoder_for_wav2vec2am_batch(
    probs_seq,
    seq_lengths,
    alphabet,
    beam_size,
    num_threads,
    cutoff_prob=1.0,
    cutoff_top_n=40,
    blank_id=-1,
    ignored_symbols=frozenset(),
    scorer=None,
    hot_words=dict(),
    num_results=1,
):
    """Wrapper for the batched CTC beam search decoder for wav2vec2 acoustic
    models.

    :param probs_seq: 3-D list with each element as an instance of 2-D list of
                      probabilities used by ctc_beam_search_decoder().
    :type probs_seq: 3-D list
    :param seq_lengths: Number of valid time steps in each sample of the batch.
    :type seq_lengths: list[int]
    :param alphabet: Alphabet object matching the acoustic model output labels.
    :type alphabet: Alphabet
    :param beam_size: Width for beam search.
    :type beam_size: int
    :param num_threads: Number of threads to use for processing the batch.
    :type num_threads: int
    :param cutoff_prob: Cutoff probability in alphabet pruning, default 1.0,
                        no pruning.
    :type cutoff_prob: float
    :param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n
                         characters with highest probs in alphabet will be
                         used in beam search, default 40.
    :type cutoff_top_n: int
    :param blank_id: Index of the CTC blank symbol in the acoustic model
                     outputs.
    :type blank_id: int
    :param ignored_symbols: Indices of output symbols to ignore during
                            decoding.
    :type ignored_symbols: frozenset[int]
    :param scorer: External scorer for partially decoded sentence, e.g. word
                   count or language model.
    :type scorer: Scorer
    :param hot_words: Map of words (keys) to their assigned boosts (values)
    :type hot_words: dict[string, float]
    :param num_results: Number of beams to return.
    :type num_results: int
    :return: List (one entry per sample) of lists of DecodeResult namedtuples,
             in descending order of confidence.
    :rtype: list
    """
    batch_beam_results = swigwrapper.ctc_beam_search_decoder_for_wav2vec2am_batch(
        probs_seq,
        seq_lengths,
        alphabet,
        beam_size,
        num_threads,
        cutoff_prob,
        cutoff_top_n,
        blank_id,
        ignored_symbols,
        scorer,
        hot_words,
        num_results,
    )
    batch_beam_results = [
        [
            DecodeResult(
                res.confidence,
                alphabet.Decode(res.tokens),
                [int(t) for t in res.tokens],
                [int(t) for t in res.timesteps],
            )
            for res in beam_results
        ]
        for beam_results in batch_beam_results
    ]
    return batch_beam_results

class FlashlightDecoderState(swigwrapper.FlashlightDecoderState):
    """
    This class contains constants used to specify the desired behavior for the
    :py:func:`flashlight_beam_search_decoder` and
    :py:func:`flashlight_beam_search_decoder_batch` functions.
    """

    class CriterionType(enum.IntEnum):
        """Constants used to specify which loss criterion was used by the
        acoustic model.

        This class is a Python :py:class:`enum.IntEnum`.
        """

        #: Decoder mode for handling acoustic models trained with CTC loss
        CTC = swigwrapper.FlashlightDecoderState.CTC

        #: Decoder mode for handling acoustic models trained with ASG loss
        ASG = swigwrapper.FlashlightDecoderState.ASG

        #: Decoder mode for handling acoustic models trained with Seq2seq loss
        #: Note: this criterion type is currently not supported.
        S2S = swigwrapper.FlashlightDecoderState.S2S

    class DecoderType(enum.IntEnum):
        """Constants used to specify whether the decoder should operate in
        lexicon mode, only predicting words present in a fixed vocabulary, or
        in lexicon-free mode, without such a restriction.

        This class is a Python :py:class:`enum.IntEnum`.
        """

        #: Lexicon mode, only predict words in the specified vocabulary.
        LexiconBased = swigwrapper.FlashlightDecoderState.LexiconBased

        #: Lexicon-free mode, allow prediction of any word.
        LexiconFree = swigwrapper.FlashlightDecoderState.LexiconFree

    class TokenType(enum.IntEnum):
        """Constants used to specify the granularity of text units used when
        training the external scorer in relation to the text units used when
        training the acoustic model. For example, you can have an acoustic
        model predicting characters and an external scorer trained on words,
        or an acoustic model and an external scorer both trained with sub-word
        units. If the acoustic model and the scorer were both trained on the
        same text unit granularity, use ``TokenType.Single``. Otherwise, if
        the external scorer was trained on a sequence of acoustic model text
        units, use ``TokenType.Aggregate``.

        This class is a Python :py:class:`enum.IntEnum`.
        """

        #: Token type for external scorers trained on the same textual units
        #: as the acoustic model.
        Single = swigwrapper.FlashlightDecoderState.Single

        #: Token type for external scorers trained on a sequence of acoustic
        #: model textual units.
        Aggregate = swigwrapper.FlashlightDecoderState.Aggregate

def flashlight_beam_search_decoder(
    logits_seq,
    alphabet,
    beam_size,
    decoder_type,
    token_type,
    lm_tokens,
    scorer=None,
    beam_threshold=25.0,
    cutoff_top_n=40,
    silence_score=0.0,
    merge_with_log_add=False,
    criterion_type=FlashlightDecoderState.CriterionType.CTC,
    transitions=[],
    num_results=1,
):
    """Decode acoustic model emissions for a single sample. Note that unlike
    :py:func:`ctc_beam_search_decoder`, this function expects raw outputs from
    CTC and ASG acoustic models, without softmaxing them over timesteps.

    :param logits_seq: 2-D list of acoustic model emissions, dimensions are
                       time steps x number of output units.
    :type logits_seq: 2-D list of floats or numpy array
    :param alphabet: Alphabet object matching the tokens used when creating
                     the acoustic model and external scorer if specified.
    :type alphabet: Alphabet
    :param beam_size: Width for beam search.
    :type beam_size: int
    :param decoder_type: Decoding mode, lexicon-constrained or lexicon-free.
    :type decoder_type: FlashlightDecoderState.DecoderType
    :param token_type: Type of token in the external scorer.
    :type token_type: FlashlightDecoderState.TokenType
    :param lm_tokens: List of tokens to constrain decoding to when in
                      lexicon-constrained mode. Must match the token type used
                      in the scorer, ie. must be a list of characters if the
                      scorer is character-based, or a list of words if the
                      scorer is word-based.
    :type lm_tokens: list[str]
    :param scorer: External scorer.
    :type scorer: Scorer
    :param beam_threshold: Maximum threshold in beam score from leading beam.
                           Any newly created candidate beams which lag behind
                           the best beam so far by more than this value will
                           get pruned. This is a performance optimization
                           parameter and an appropriate value should be found
                           empirically using a validation set.
    :type beam_threshold: float
    :param cutoff_top_n: Maximum number of tokens to expand per time step
                         during decoding. Only the highest probability
                         cutoff_top_n candidates (characters, sub-word units,
                         words) in a given timestep will be expanded. This is
                         a performance optimization parameter and an
                         appropriate value should be found empirically using a
                         validation set.
    :type cutoff_top_n: int
    :param silence_score: Score to add to beam when encountering a predicted
                          silence token (eg. the space symbol).
    :type silence_score: float
    :param merge_with_log_add: Whether to use log-add when merging scores of
                               new candidate beams equivalent to existing ones
                               (leading to the same transcription). When
                               disabled, the maximum score is used.
    :type merge_with_log_add: bool
    :param criterion_type: Criterion used for training the acoustic model.
    :type criterion_type: FlashlightDecoderState.CriterionType
    :param transitions: Transition score matrix for ASG acoustic models.
    :type transitions: list[float]
    :param num_results: Number of beams to return.
    :type num_results: int
    :return: List of FlashlightOutput structures.
    :rtype: list[FlashlightOutput]
    """
    return swigwrapper.flashlight_beam_search_decoder(
        logits_seq,
        alphabet,
        beam_size,
        beam_threshold,
        cutoff_top_n,
        scorer,
        token_type,
        lm_tokens,
        decoder_type,
        silence_score,
        merge_with_log_add,
        criterion_type,
        transitions,
        num_results,
    )

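# Illustrative sketch (not part of the module): lexicon-constrained decoding
# of raw (non-softmaxed) emissions with the Flashlight decoder. "logits" and
# "words" are hypothetical placeholders for the emission matrix and the list
# of allowed words; "alphabet" and "scorer" are assumed from earlier sketches.
#
#   results = flashlight_beam_search_decoder(
#       logits, alphabet, beam_size=100,
#       decoder_type=FlashlightDecoderState.DecoderType.LexiconBased,
#       token_type=FlashlightDecoderState.TokenType.Aggregate,
#       lm_tokens=words, scorer=scorer)
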
def flashlight_beam_search_decoder_batch(
    probs_seq,
    seq_lengths,
    alphabet,
    beam_size,
    decoder_type,
    token_type,
    lm_tokens,
    num_processes,
    scorer=None,
    beam_threshold=25.0,
    cutoff_top_n=40,
    silence_score=0.0,
    merge_with_log_add=False,
    criterion_type=FlashlightDecoderState.CriterionType.CTC,
    transitions=[],
    num_results=1,
):
    """Decode batch acoustic model emissions in parallel. ``num_processes``
    controls how many samples from the batch will be decoded simultaneously.
    All the other parameters are forwarded to
    :py:func:`flashlight_beam_search_decoder`.

    Returns a list of lists of FlashlightOutput structures.
    """
    return swigwrapper.flashlight_beam_search_decoder_batch(
        probs_seq,
        seq_lengths,
        alphabet,
        beam_size,
        beam_threshold,
        cutoff_top_n,
        scorer,
        token_type,
        lm_tokens,
        decoder_type,
        silence_score,
        merge_with_log_add,
        criterion_type,
        transitions,
        num_results,
        num_processes,
    )

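# Illustrative sketch (not part of the module): the batch variant forwards the
# same parameters per sample and decodes ``num_processes`` samples in
# parallel. "logits_batch" and "lengths" are hypothetical placeholders.
#
#   batch_results = flashlight_beam_search_decoder_batch(
#       logits_batch, lengths, alphabet, beam_size=100,
#       decoder_type=FlashlightDecoderState.DecoderType.LexiconFree,
#       token_type=FlashlightDecoderState.TokenType.Single,
#       lm_tokens=[], scorer=scorer, num_processes=4)
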
class UTF8Alphabet(swigwrapper.UTF8Alphabet):
    """Alphabet class representing 255 possible byte values for Bytes Output
    Mode. For internal use only.
    """

    def __init__(self):
        super(UTF8Alphabet, self).__init__()
        err = self.init(b"")
        if err != 0:
            raise ValueError(
                "UTF8Alphabet initialization failed with error code 0x{:X}".format(err)
            )

    def CanEncodeSingle(self, input):
        """
        Returns true if the single character/output class has a corresponding
        label in the alphabet.
        """
        return super(UTF8Alphabet, self).CanEncodeSingle(input.encode("utf-8"))

    def CanEncode(self, input):
        """
        Returns true if the entire string can be encoded into labels in this
        alphabet.
        """
        return super(UTF8Alphabet, self).CanEncode(input.encode("utf-8"))

    def EncodeSingle(self, input):
        """
        Encode a single character/output class into a label. Character must be
        in the alphabet, this method will assert that. Use ``CanEncodeSingle``
        to test.
        """
        return super(UTF8Alphabet, self).EncodeSingle(input.encode("utf-8"))

    def Encode(self, input):
        """
        Encode a sequence of character/output classes into a sequence of
        labels. Characters are assumed to always take a single Unicode
        codepoint. Characters must be in the alphabet, this method will assert
        that. Use ``CanEncode`` and ``CanEncodeSingle`` to test.
        """
        # Convert SWIG's UnsignedIntVec to a Python list
        res = super(UTF8Alphabet, self).Encode(input.encode("utf-8"))
        return [el for el in res]

    def DecodeSingle(self, input):
        """Decode a single label into its corresponding string token."""
        res = super(UTF8Alphabet, self).DecodeSingle(input)
        return res.decode("utf-8")

    def Decode(self, input):
        """Decode a sequence of labels into a string."""
        res = super(UTF8Alphabet, self).Decode(input)
        return res.decode("utf-8")