Source code for cornac.models.knn.recom_knn

# Copyright 2018 The Cornac Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

import multiprocessing

import numpy as np
from scipy.sparse import csr_matrix, coo_matrix

from ..recommender import Recommender
from ...exception import ScoreException
from ...utils import get_rng
from ...utils.fast_sparse_funcs import inplace_csr_row_normalize_l2
from .similarity import compute_similarity, compute_score, compute_score_single


# Small positive constant. Used in this module (a) to replace values that become
# exactly zero after mean-centering and (b) as an additive offset on the
# IDF / BM25 weight vectors.
EPS = 1e-8

# Values accepted for the `similarity` argument of UserKNN / ItemKNN.
SIMILARITIES = ["cosine", "pearson"]
# Values accepted for the `weighting` argument of UserKNN / ItemKNN
# (None is also accepted and means "no re-weighting").
WEIGHTING_OPTIONS = ["idf", "bm25"]


def _mean_centered(ui_mat):
    """Subtract from every stored value the mean of its row, in place.

    Parameters
    ----------
    ui_mat: scipy.sparse.csr_matrix
        Matrix whose ``data`` array is modified in place. Only stored
        entries take part in the mean.

    Returns
    -------
    ui_mat: scipy.sparse.csr_matrix
        The very same matrix object, now mean-centered row by row.

    mean_arr: numpy.ndarray
        Mean of the stored values of each row. Rows without any stored entry
        get a mean of 0 (instead of NaN).
    """
    mean_arr = np.zeros(ui_mat.shape[0])
    for i in range(ui_mat.shape[0]):
        start_idx, end_idx = ui_mat.indptr[i : i + 2]
        if start_idx == end_idx:
            # Empty row: np.mean of an empty slice would yield NaN (plus a
            # RuntimeWarning) and poison every score computed from mean_arr.
            continue

        # Basic slicing returns a view, so the updates below write straight
        # into ui_mat.data.
        row_data = ui_mat.data[start_idx:end_idx]
        mean_arr[i] = np.mean(row_data)
        row_data -= mean_arr[i]
        # Values equal to the row mean would become exact zeros; replace them
        # with a tiny constant so they remain non-zero stored entries.
        row_data[row_data == 0] = EPS

    return ui_mat, mean_arr


def _amplify(ui_mat, alpha=1.0):
    """Exponentially amplify the stored values of a similarity matrix, in place.

    Every stored value ``w`` becomes ``w ** alpha`` when ``w > 0`` and
    ``-((-w) ** alpha)`` otherwise, so the sign of each value is preserved.

    Parameters
    ----------
    ui_mat: scipy.sparse matrix
        Matrix exposing a ``data`` array; it is modified in place.

    alpha: float, optional, default: 1.0
        The exponent. With ``alpha == 1.0`` the matrix is returned untouched.

    Returns
    -------
    ui_mat: scipy.sparse matrix
        The same matrix object that was passed in.
    """
    if alpha == 1.0:
        return ui_mat

    # Vectorized over all stored values instead of a Python-level loop.
    data = ui_mat.data
    positive = data > 0
    non_positive = ~positive
    data[positive] = data[positive] ** alpha
    # Raise the magnitude to the power and restore the sign afterwards, so
    # fractional exponents never see a negative base.
    data[non_positive] = -((-data[non_positive]) ** alpha)
    return ui_mat


def _idf_weight(ui_mat):
    """Compute Inverse Document (Item) Frequency weights for a matrix.

    Parameters
    ----------
    ui_mat: scipy.sparse matrix
        Matrix with rows as documents (users) and columns as items.

    Returns
    -------
    weights: numpy.ndarray
        One weight per stored entry, ``log(n_rows / item_count) + EPS``,
        ordered like the entries of ``coo_matrix(ui_mat)``.
    """
    X = coo_matrix(ui_mat)

    # calculate IDF
    N = float(X.shape[0])
    item_counts = np.bincount(X.col, minlength=X.shape[1])
    # Columns without any stored entry have a count of 0; their IDF is never
    # looked up below, so silence the division warning they would trigger.
    with np.errstate(divide="ignore", invalid="ignore"):
        idf = np.log(N / item_counts)

    # Index with the COO column ids rather than `ui_mat.indices`, which only
    # holds column ids when the input happens to be in CSR format.
    weights = idf[X.col] + EPS
    return weights


def _bm25_weight(ui_mat):
    """Compute BM25 weights for the stored entries of a matrix.

    Every stored entry is treated as a term frequency of one; rows act as
    documents (users) and columns as terms (items).

    Parameters
    ----------
    ui_mat: scipy.sparse matrix
        The matrix to derive weights from. It is not modified.

    Returns
    -------
    weights: numpy.ndarray
        One weight per stored entry, ordered like the entries of
        ``coo_matrix(ui_mat)``.
    """
    k1, b = 1.2, 0.8

    # Binarized COO view: only the sparsity pattern matters from here on.
    binary = coo_matrix(ui_mat)
    binary.data = np.ones_like(binary.data)

    # Inverse document frequency of every item.
    n_docs = float(binary.shape[0])
    item_idf = np.log(n_docs / np.bincount(binary.col))

    # Length normalization of every document (user), relative to the average
    # number of stored entries per row.
    doc_lengths = np.ravel(binary.sum(axis=1))
    mean_length = doc_lengths.mean()
    doc_norm = (1.0 - b) + b * doc_lengths / mean_length

    # Saturating term-frequency factor, then scaled by the item IDF.
    tf_factor = (k1 + 1.0) / (k1 * doc_norm[binary.row] + binary.data)
    return tf_factor * item_idf[binary.col] + EPS



[docs]
class UserKNN(Recommender):
    """User-Based Nearest Neighbor.

    Parameters
    ----------
    name: string, default: 'UserKNN'
        The name of the recommender model.

    k: int, optional, default: 20
        The number of nearest neighbors.

    similarity: str, optional, default: 'cosine'
        The similarity measurement. Supported types: ['cosine', 'pearson']

    mean_centered: bool, optional, default: False
        Whether the similarity is computed on the user-item rating matrix
        centered by the mean of its rows (mean rating of each user).
        'pearson' similarity always uses the centered matrix.

    weighting: str, optional, default: None
        The option for re-weighting the rating matrix. Supported types: ['idf', 'bm25'].
        If None, no weighting is applied.

    amplify: float, optional, default: 1.0
        Exponent applied to the similarity weights (sign-preserving).
        A value of 1.0 leaves the similarities unchanged.

    num_threads: int, optional, default: 0
        Number of parallel threads. Values that are not strictly between 0 and the
        number of CPU cores result in all CPU cores being used.
        If seed is not None, num_threads=1 to remove randomness from parallelization.

    trainable: bool, optional, default: True
        Forwarded unchanged to the base `Recommender`.

    verbose: bool, optional, default: True
        Forwarded to the base `Recommender` and to the similarity computation.

    seed: int, optional, default: None
        Seed used to create the random number generator stored in `self.rng`.

    References
    ----------
    * Breese, J. S., Heckerman, D., & Kadie, C. (1998). Empirical analysis of predictive algorithms for collaborative filtering. Microsoft Research, Microsoft Corporation, One Microsoft Way, Redmond, WA, 98052.
    * Aggarwal, C. C. (2016). Recommender systems (Vol. 1). Cham: Springer International Publishing.
    """

    def __init__(
        self,
        name="UserKNN",
        k=20,
        similarity="cosine",
        mean_centered=False,
        weighting=None,
        amplify=1.0,
        num_threads=0,
        trainable=True,
        verbose=True,
        seed=None,
    ):
        super().__init__(name=name, trainable=trainable, verbose=verbose)
        self.k = k
        self.similarity = similarity
        self.mean_centered = mean_centered
        self.weighting = weighting
        self.amplify = amplify
        self.seed = seed
        self.rng = get_rng(seed)

        # Validate the categorical options up front.
        if similarity not in SIMILARITIES:
            raise ValueError(
                "Invalid similarity choice, supported {}".format(SIMILARITIES)
            )

        if not (weighting is None or weighting in WEIGHTING_OPTIONS):
            raise ValueError(
                "Invalid weighting choice, supported {}".format(WEIGHTING_OPTIONS)
            )

        # A fixed seed forces single-threaded execution; otherwise honor the
        # request only when it lies strictly between 0 and the core count.
        if seed is not None:
            self.num_threads = 1
        else:
            n_cpus = multiprocessing.cpu_count()
            self.num_threads = num_threads if 0 < num_threads < n_cpus else n_cpus

    def fit(self, train_set, val_set=None):
        """Fit the model to observations.

        Parameters
        ----------
        train_set: :obj:`cornac.data.Dataset`, required
            User-Item preference data as well as additional modalities.

        val_set: :obj:`cornac.data.Dataset`, optional, default: None
            User-Item preference data for model selection purposes (e.g., early stopping).

        Returns
        -------
        self : object
        """
        Recommender.fit(self, train_set, val_set)

        self.ui_mat = train_set.matrix.copy()
        self.mean_arr = np.zeros(self.ui_mat.shape[0])
        has_explicit_feedback = self.min_rating != self.max_rating
        if has_explicit_feedback:
            self.ui_mat, self.mean_arr = _mean_centered(self.ui_mat)

        # Similarities are computed on the centered ratings when requested
        # (or for pearson), and on the raw ratings otherwise.
        use_centered = self.mean_centered or self.similarity == "pearson"
        source_mat = self.ui_mat if use_centered else train_set.matrix
        weight_mat = source_mat.copy()

        # Optional re-weighting, always derived from the raw training matrix.
        reweight = None
        if self.weighting == "idf":
            reweight = _idf_weight
        elif self.weighting == "bm25":
            reweight = _bm25_weight
        if reweight is not None:
            weight_mat.data *= np.sqrt(reweight(train_set.matrix))

        # Prediction only needs the item-user layout, so drop the user-item one.
        self.iu_mat = self.ui_mat.T.tocsr()
        del self.ui_mat

        user_sims = compute_similarity(
            weight_mat, k=self.k, num_threads=self.num_threads, verbose=self.verbose
        )
        self.sim_mat = _amplify(user_sims, self.amplify)

        return self

    def score(self, user_idx, item_idx=None):
        """Predict the scores/ratings of a user for an item.

        Parameters
        ----------
        user_idx: int, required
            The index of the user for whom to perform score prediction.

        item_idx: int, optional, default: None
            The index of the item for which to perform score prediction.
            If None, scores for all known items will be returned.

        Returns
        -------
        res : A scalar or a Numpy array
            The user's mean rating plus the neighborhood-based score, for the
            given item or for all known items.

        Raises
        ------
        ScoreException
            If the user, or the requested item, is not known to the model.
        """
        if not self.knows_user(user_idx):
            raise ScoreException(
                "Can't make score prediction for (user_id=%d)" % user_idx
            )

        if item_idx is None:
            # Score every item at once; the kernel fills `all_items_avg`.
            all_items_avg = np.zeros(self.num_items)
            neighbor_sims = self.sim_mat[user_idx].toarray().ravel()
            compute_score(
                True,
                neighbor_sims,
                self.iu_mat.indptr,
                self.iu_mat.indices,
                self.iu_mat.data,
                k=self.k,
                num_threads=self.num_threads,
                output=all_items_avg,
            )
            return self.mean_arr[user_idx] + all_items_avg

        if not self.knows_item(item_idx):
            raise ScoreException(
                "Can't make score prediction for (item_id=%d)" % item_idx
            )

        # Single item: hand the kernel the bounds of that item's row in the
        # item-user matrix.
        neighbor_sims = self.sim_mat[user_idx].toarray().ravel()
        single_avg = compute_score_single(
            True,
            neighbor_sims,
            self.iu_mat.indptr[item_idx],
            self.iu_mat.indptr[item_idx + 1],
            self.iu_mat.indices,
            self.iu_mat.data,
            k=self.k,
        )
        return self.mean_arr[user_idx] + single_avg





[docs]
class ItemKNN(Recommender):
    """Item-Based Nearest Neighbor.

    Parameters
    ----------
    name: string, default: 'ItemKNN'
        The name of the recommender model.

    k: int, optional, default: 20
        The number of nearest neighbors.

    similarity: str, optional, default: 'cosine'
        The similarity measurement. Supported types: ['cosine', 'pearson']

    mean_centered: bool, optional, default: False
        Whether the similarity is computed on the user-item rating matrix
        centered by the mean of its rows (mean rating of each user).

    weighting: str, optional, default: None
        The option for re-weighting the rating matrix. Supported types: ['idf', 'bm25'].
        If None, no weighting is applied.

    amplify: float, optional, default: 1.0
        Exponent applied to the similarity weights (sign-preserving).
        A value of 1.0 leaves the similarities unchanged.

    num_threads: int, optional, default: 0
        Number of parallel threads. Values that are not strictly between 0 and the
        number of CPU cores result in all CPU cores being used.
        If seed is not None, num_threads=1 to remove randomness from parallelization.

    trainable: bool, optional, default: True
        Forwarded unchanged to the base `Recommender`.

    verbose: bool, optional, default: True
        Forwarded to the base `Recommender` and to the similarity computation.

    seed: int, optional, default: None
        Seed used to create the random number generator stored in `self.rng`.

    References
    ----------
    * Sarwar, B., Karypis, G., Konstan, J., & Riedl, J. (2001, April). Item-based collaborative filtering recommendation algorithms. In Proceedings of the 10th international conference on World Wide Web (pp. 285-295).
    * Aggarwal, C. C. (2016). Recommender systems (Vol. 1). Cham: Springer International Publishing.
    """

    def __init__(
        self,
        name="ItemKNN",
        k=20,
        similarity="cosine",
        mean_centered=False,
        weighting=None,
        amplify=1.0,
        num_threads=0,
        trainable=True,
        verbose=True,
        seed=None,
    ):
        super().__init__(name=name, trainable=trainable, verbose=verbose)
        self.k = k
        self.similarity = similarity
        self.mean_centered = mean_centered
        self.weighting = weighting
        self.amplify = amplify
        self.seed = seed
        self.rng = get_rng(seed)

        # Validate the categorical options up front.
        if similarity not in SIMILARITIES:
            raise ValueError(
                "Invalid similarity choice, supported {}".format(SIMILARITIES)
            )

        if not (weighting is None or weighting in WEIGHTING_OPTIONS):
            raise ValueError(
                "Invalid weighting choice, supported {}".format(WEIGHTING_OPTIONS)
            )

        # A fixed seed forces single-threaded execution; otherwise honor the
        # request only when it lies strictly between 0 and the core count.
        if seed is not None:
            self.num_threads = 1
        else:
            n_cpus = multiprocessing.cpu_count()
            self.num_threads = num_threads if 0 < num_threads < n_cpus else n_cpus

    def fit(self, train_set, val_set=None):
        """Fit the model to observations.

        Parameters
        ----------
        train_set: :obj:`cornac.data.Dataset`, required
            User-Item preference data as well as additional modalities.

        val_set: :obj:`cornac.data.Dataset`, optional, default: None
            User-Item preference data for model selection purposes (e.g., early stopping).

        Returns
        -------
        self : object
        """
        Recommender.fit(self, train_set, val_set)

        # Unlike the similarity input below, this (possibly centered) matrix
        # is kept on the instance because scoring reads the user's row from it.
        self.ui_mat = train_set.matrix.copy()
        self.mean_arr = np.zeros(self.ui_mat.shape[0])
        has_explicit_feedback = self.min_rating != self.max_rating
        if has_explicit_feedback:
            self.ui_mat, self.mean_arr = _mean_centered(self.ui_mat)

        source_mat = self.ui_mat if self.mean_centered else train_set.matrix
        weight_mat = source_mat.copy()

        if self.similarity == "pearson":
            # Pearson centers by columns: transpose so items become rows,
            # center those rows, then transpose back to the user-item layout.
            item_major, _ = _mean_centered(weight_mat.T.tocsr())
            weight_mat = item_major.T.tocsr()

        # Optional re-weighting, always derived from the raw training matrix.
        reweight = None
        if self.weighting == "idf":
            reweight = _idf_weight
        elif self.weighting == "bm25":
            reweight = _bm25_weight
        if reweight is not None:
            weight_mat.data *= np.sqrt(reweight(train_set.matrix))

        # Similarities are computed between rows, so items must be the rows.
        item_sims = compute_similarity(
            weight_mat.T.tocsr(),
            k=self.k,
            num_threads=self.num_threads,
            verbose=self.verbose,
        )
        self.sim_mat = _amplify(item_sims, self.amplify)

        return self

    def score(self, user_idx, item_idx=None):
        """Predict the scores/ratings of a user for an item.

        Parameters
        ----------
        user_idx: int, required
            The index of the user for whom to perform score prediction.

        item_idx: int, optional, default: None
            The index of the item for which to perform score prediction.
            If None, scores for all known items will be returned.

        Returns
        -------
        res : A scalar or a Numpy array
            The user's mean rating plus the neighborhood-based score, for the
            given item or for all known items.

        Raises
        ------
        ScoreException
            If the user, or the requested item, is unknown to the model.
        """
        if self.is_unknown_user(user_idx):
            raise ScoreException("Can't make score prediction for user %d" % user_idx)

        if item_idx is None:
            # Score every item at once; the kernel fills `all_items_avg`.
            all_items_avg = np.zeros(self.num_items)
            user_ratings = self.ui_mat[user_idx].toarray().ravel()
            compute_score(
                False,
                user_ratings,
                self.sim_mat.indptr,
                self.sim_mat.indices,
                self.sim_mat.data,
                k=self.k,
                num_threads=self.num_threads,
                output=all_items_avg,
            )
            return self.mean_arr[user_idx] + all_items_avg

        if self.is_unknown_item(item_idx):
            raise ScoreException("Can't make score prediction for item %d" % item_idx)

        # Single item: hand the kernel the bounds of that item's row in the
        # item-item similarity matrix.
        user_ratings = self.ui_mat[user_idx].toarray().ravel()
        single_avg = compute_score_single(
            False,
            user_ratings,
            self.sim_mat.indptr[item_idx],
            self.sim_mat.indptr[item_idx + 1],
            self.sim_mat.indices,
            self.sim_mat.data,
            k=self.k,
        )
        return self.mean_arr[user_idx] + single_avg