Source code for myfm.utils.encoders.multi_value

from typing import Iterable

import scipy.sparse as sps
from typing_extensions import Literal

from .categorical import CategoryValueToSparseEncoder


class MultipleValuesToSparseEncoder(CategoryValueToSparseEncoder[str]):
    """N-hot encode separator-joined lists of items into a sparse matrix.

    Every input string is split on `sep`; each resulting non-empty item that
    maps to a column index produces one non-zero entry in the output row.
    """

    def __init__(
        self,
        items: Iterable[str],
        min_freq: int = 1,
        sep: str = ",",
        normalize: bool = True,
        handle_unknown: Literal["create", "ignore", "raise"] = "create",
    ):
        """Construct the encoder by providing a list of strings,
        each of which is a list of strings concatenated by `sep`.

        Parameters
        ----------
        items : Iterable[str]
            Iterable of strings, each of which is a concatenated list of possibly multiple items.
        min_freq : int, optional
            The minimal frequency for an item to be retained in the known items list, by default 1.
        sep: str, optional
            Tells how to separate string back into a list. Defaults to `','`.
        normalize: bool, optional
            If `True`, non-zero entry in the encoded matrix will have `1 / N ** 0.5`,
            where `N` is the number of non-zero entries in that row. Defaults to `True`.
        handle_unknown: Literal["create", "ignore", "raise"], optional
            How to handle previously unseen values during encoding.
            If "create", then there is a single category named "__UNK__" for unknown values,
            and it is treated as 0th category.
            If "ignore", such an item will be ignored.
            If "raise", a `KeyError` is raised.
            Defaults to "create".
        """
        # `set` de-duplicates within a single row, so an item repeated inside
        # one string is handed to the base class only once for that row.
        # Empty strings (e.g. from "a,,b" or a trailing separator) are dropped.
        items_flatten = [
            item for row in items for item in set(row.split(sep)) if item
        ]
        self.sep = sep
        self.normalize = normalize
        super().__init__(
            items_flatten, min_freq=min_freq, handle_unknown=handle_unknown
        )

    def to_sparse(self, items: Iterable[str]) -> sps.csr_matrix:
        """Encode each string in `items` as one row of a CSR matrix.

        Parameters
        ----------
        items : Iterable[str]
            Iterable of strings, each of which is a list of items
            concatenated by `self.sep`.

        Returns
        -------
        sps.csr_matrix
            Matrix with one row per input string and `len(self)` columns.
            The non-zero entries of a row sit at the distinct column indices
            obtained from `self._get_index` for the row's non-empty items.
            Each entry is `1 / N ** 0.5` (with `N` the number of non-zero
            entries in that row) if `self.normalize` is true, else `1.0`.
            A row without any resolved index is left empty.
        """
        # CSR building blocks: row i occupies indices[indptr[i]:indptr[i + 1]].
        indptr = [0]
        indices = []
        data = []
        for row in items:
            tokens = (token for token in row.split(self.sep) if token)
            looked_up = (self._get_index(token) for token in tokens)
            # A set collapses duplicate columns (repeated items, or several
            # items resolving to the same index); `None` results are skipped.
            # Sorting keeps the column indices of each row in ascending order.
            row_indices = sorted(
                {index for index in looked_up if index is not None}
            )
            n = len(row_indices)
            if n:
                value = 1.0 / (float(n) ** 0.5) if self.normalize else 1.0
                indices.extend(row_indices)
                data.extend([value] * n)
            # The running length of `indices` is exactly the next row pointer,
            # and it is appended for empty rows too.
            indptr.append(len(indices))
        return sps.csr_matrix(
            (data, indices, indptr),
            shape=(len(indptr) - 1, len(self)),
        )