Source code for sparse_som.classifier

from . import som
from collections import defaultdict, Counter
from operator import itemgetter
import numpy as np


[docs]class SomClassifier:
[docs]    def __init__(self, cls=som.BSom, height=10, width=10, dim=None, **kwargs):
        """\
        :param cls: SOM constructor
        :type cls: :class:`Som` or :class:`BSom`
        :param height: SOM height (default 10)
        :type height: int
        :param width: SOM width (default 10)
        :type width: int
        :param dim: SOM nodes nb of dimensions (if None, detected when :class:`fit` called)
        :type dim: int
        :param \**kwargs: named parameters for the constructor
        """
        # take any positional arguments to named
        kwargs['h'] = height
        kwargs['w'] = width
        if dim is not None:
            kwargs['d'] = dim

        self._cls = cls             # the network constructor
        self._kwargs = kwargs       # named params for constructor
        self._som = None            # the SOM network
        self._bmus = None           # bmus of the training data
        self.classifier = None      # used for network calibration
        self.quant_error = None
        self.topog_error = None

[docs]    def setup(self, **kwargs):
        """\
        Add / Change SOM constructor parameters.
        """
        assert self._som is None, 'cannot setup params after SOM instanciation'
        self._kwargs.update(kwargs)

[docs]    def params(self):
        """\
        Get the SOM constructor parameters.
        """
        return self._kwargs

    @property
    def som(self):
        # lazy constructor
        if self._som is None:
            self._som = self._cls(**self._kwargs)
        return self._som

[docs]    def fit(self, data, labels, **kwargs):
        """\
        Training the SOM on the the data and calibrate itself.

        After the training, `self.quant_error` and `self.topog_error` are
        respectively set.

        :param data: sparse input matrix (ideal dtype is `numpy.float32`)
        :type data: :class:`scipy.sparse.csr_matrix`
        :param labels: the labels associated with data
        :type labels: iterable
        :param \**kwargs: optional parameters for :meth:`train`
        """

        # set vectors dimensions if they're missing
        if 'd' not in self._kwargs:
            self._kwargs['d'] = data.shape[1]

        assert self.som.dim == data.shape[1], 'dimension mismatch'

        # train the network
        self.som.train(data, **kwargs)
        # retrieve first and second bmus and distances
        bmus = self.som.bmus(data, True, True)
        # set errors measures of training data
        self.quant_error = self.som.quantization_error
        self.topog_error = self.som.topographic_error
        # store training bmus
        self._bmus = bmus
        # calibrate
        self._calibrate(data, labels)

    def _calibrate(self, data, labels):
        """\
        Calibrate the network using `self._bmus`.
        """
        # network calibration
        classifier = defaultdict(Counter)
        for (i, j), label in zip(self._bmus, labels):
            classifier[i, j][label] += 1
        self.classifier = {}
        for ij, cnt in classifier.items():
            maxi = max(cnt.items(), key=itemgetter(1))
            nb = sum(cnt.values())
            self.classifier[ij] = maxi[0], maxi[1] / nb

    def _predict_from_bmus(self, bmus, unkown):
        lst = []
        for i, j in bmus:
            cls = self.classifier.get((i, j))
            if cls is None:
                lst.append(unkown)
            else:
                lbl, p = cls
                lst.append(lbl)
        return np.array(lst)

[docs]    def predict(self, data, unkown=None):
        """\
        Classify data according to previous calibration.

        :param data: sparse input matrix (ideal dtype is `numpy.float32`)
        :type data: :class:`scipy.sparse.csr_matrix`
        :param unkown: the label to attribute if no label is known
        :returns: the labels guessed for data
        :rtype: `numpy.array`
        """
        assert self.classifier is not None, 'not calibrated'
        bmus = self.som.bmus(data)
        return self._predict_from_bmus(bmus, unkown)

[docs]    def fit_predict(self, data, labels, unkown=None):
        """\
        Fit and classify data efficiently.

        :param data: sparse input matrix (ideal dtype is `numpy.float32`)
        :type data: :class:`scipy.sparse.csr_matrix`
        :param labels: the labels associated with data
        :type labels: iterable
        :param unkown: the label to attribute if no label is known
        :returns: the labels guessed for data
        :rtype: `numpy.array`
        """
        self.fit(data, labels)
        return self._predict_from_bmus(self._bmus, unkown)

[docs]    def get_precision(self):
        """\
        :returns: the ratio part of the dominant label for each unit.
        :rtype: 2D :class:`numpy.ndarray`
        """
        assert self.classifier is not None, 'not calibrated'
        arr = np.zeros((self.som.nrows, self.som.ncols))
        for ij, (lbl, p) in self.classifier.items():
            arr[ij] = p
        return arr

[docs]    def histogram(self, bmus=None):
        """\
        Return a 2D histogram of bmus.

        :param bmus: the best-match units indexes for underlying data.
        :type bmus: :class:`numpy.ndarray`
        :returns: the computed 2D histogram of bmus.
        :rtype: :class:`numpy.ndarray`
        """
        if bmus is None:
            assert self._bmus is not None, 'not trained'
            bmus = self._bmus
        arr = np.zeros((self.som.nrows, self.som.ncols))
        for i, j in bmus:
            arr[i, j] += 1
        return arr

    def __repr__(self):
        return '<SomClassifier: ' + ', '.join(map('%s=%s'.__mod__, self._kwargs.items())) + '>'