# Source code for cornac.data.graph

# Copyright 2018 The Cornac Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

import scipy.sparse as sp
import numpy as np
from tqdm.auto import trange

from . import FeatureModality



class GraphModality(FeatureModality):
    """Graph modality

    Parameters
    ----------
    data: List[tuple], required
        A list encoding an adjacency matrix, of a user or an item graph, in the sparse triplet format, \
        e.g., data=[('user1', 'user4', 1.0)].
    """

    def __init__(self, **kwargs):
        """Keep the raw relation data and start with an empty adjacency cache.

        All keyword arguments are forwarded to the parent constructor; the
        optional ``data`` entry is additionally stored as ``raw_data``.
        """
        super().__init__(**kwargs)
        # Neither the matrix nor its size is known until build() receives an id mapping.
        self.__matrix_size = None
        self.__matrix = None
        self.raw_data = kwargs.get("data")

    @property
    def matrix(self):
        """Return the adjacency matrix in scipy csr sparse format
        """
        if self.__matrix is None:
            assert self.__matrix_size is not None
            self.__matrix = sp.csr_matrix(
                (self.val, (self.map_rid, self.map_cid)),
                shape=(self.__matrix_size, self.__matrix_size),
            )
        return self.__matrix

    def _build_triplet(self, id_map):
        """Build adjacency matrix in sparse triplet format using cornac's mapped ids
        """
        self.map_rid = []
        self.map_cid = []
        self.val = []
        for i, j, v in self.raw_data:
            if (i not in id_map) or (j not in id_map):
                continue
            self.map_rid.append(id_map[i])
            self.map_cid.append(id_map[j])
            self.val.append(v)

        self.map_rid = np.asarray(self.map_rid, dtype='int')
        self.map_cid = np.asarray(self.map_cid, dtype='int')
        self.val = np.asarray(self.val, dtype='float')


    def build(self, id_map=None, **kwargs):
        """Build the modality with respect to the given id mapping.

        Parameters
        ----------
        id_map: dict, optional, default: None
            Mapping from raw object ids to cornac's integer ids. When given,
            the adjacency matrix size is set to the largest mapped id plus one
            and the mapped triplets are rebuilt.

        **kwargs:
            Accepted for interface compatibility; not used here.

        Returns
        -------
        The modality itself.
        """
        super().build(id_map=id_map)

        # Any previously materialized matrix is stale after a rebuild.
        self.__matrix = None
        if id_map is None:
            return self

        largest_id = max(id_map.values())
        self.__matrix_size = int(largest_id + 1)
        self._build_triplet(id_map)
        return self



    def get_train_triplet(self, train_row_ids, train_col_ids):
        """Get the subset of relations which align with the training data

        Parameters
        ----------
        train_row_ids: array, required
            An array containing the ids of training objects (users or items) for which to get the "out" relations. \

        train_col_ids: array, required
            An array containing the ids of training objects (users or items) for whom to get the "in" relations.
            Please refer to cornac/models/c2pf/recom_c2pf.py for a concrete usage example of this function.

        Returns
        -------
        A subset of the adjacency matrix, in the sparse triplet format, whose elements align with the training \
        set as specified by "train_row_ids" and "train_col_ids".
        """
        picked_idx = []
        train_row_ids = (
            set(train_row_ids) if not isinstance(train_row_ids, set) else train_row_ids
        )
        train_col_ids = (
            set(train_col_ids) if not isinstance(train_col_ids, set) else train_col_ids
        )
        for idx, (i, j) in enumerate(zip(self.map_rid, self.map_cid)):
            if (i not in train_row_ids) or (j not in train_col_ids):
                continue
            picked_idx.append(idx)

        return self.map_rid[picked_idx], self.map_cid[picked_idx], self.val[picked_idx]



    def get_node_degree(self, in_ids=None, out_ids=None):
        """Get the "in" and "out" degree for the desired set of nodes

        Parameters
        ----------
        in_ids: array, required
            An array containing the ids for which to get the "in" degree. \

        out_ids: array, required
            An array containing the ids for which to get the "out" degree. \

        Returns
        -------
        Dictionary of the from {node_id: [in_degree,out_degree]}
        """

        degree = {}

        if in_ids is None:
            in_ids = self.map_cid
        if out_ids is None:
            out_ids = self.map_rid

        in_ids = set(in_ids) if not isinstance(in_ids, set) else in_ids
        out_ids = set(out_ids) if not isinstance(out_ids, set) else out_ids
        for (i, j) in zip(self.map_rid, self.map_cid):
            if (i not in out_ids) or (j not in in_ids):
                continue
            degree[i] = degree.get(i, np.asarray([0, 0])) + np.asarray([0, 1])
            degree[j] = degree.get(j, np.asarray([0, 0])) + np.asarray([1, 0])
        return degree


    # TODO: add feature_fallback decorator and give this API a more meaningful name

    def batch(self, batch_ids):
        """Return batch of vectors from the sparse adjacency matrix corresponding to provided batch_ids.

        Parameters
        ----------
        batch_ids: array, required
            An array containing the ids of rows to be returned from the sparse adjacency matrix.
        """

        return self.matrix[batch_ids]


    @staticmethod
    def _to_triplet(mat, ids=None):
        """Covert a 2d array into sparse triplet format.

        Parameters
        ----------
        mat: 2d array, required
            A Numpy 2d array of integers.
        ids: list, optional, default: None
            A list of ids (or labels) of the objects to be used in the output triplet matrix.

        Returns
        -------
        A set corresponding to the sparse triplet representation of mat.
        """
        tuples = set()
        n = mat.shape[0]
        k = mat.shape[1]

        if ids is None:
            ids = range(n)
        for n_ in range(n):
            for k_ in range(k):
                j = int(mat[n_, k_])
                tuples.add((ids[n_], ids[j], 1.0))

        return tuples

    @staticmethod
    def _to_symmetric(triplets):
        """ Transform an asymmetric adjacency matrix to a symmetric one.

        Parameters
        ----------
        triplets: Python set, required
            A Python set representing an adjacency matrix in the sparse triplet format.

        Returns
        -------
        Python set representing a symmetric adjacency matrix.
        """
        triplets.update([(j, i, v) for (i, j, v) in triplets])
        return triplets

    @staticmethod
    def _build_knn(features, k=5, similarity="cosine", verbose=True):
        """Find the k nearest neighbors of every object using similarities among their features.

        Parameters
        ----------
        features: 2d array, required
            A 2d Numpy array of features (object-by-features).
        k: int, optional, default: 5
            The number of nearest neighbors. It must lie between 1 and the number of objects.
            When k equals the number of objects, each object is also listed among its own neighbors.
        similarity: string, optional, default: "cosine"
            The similarity measure. At this time only the cosine is supported; with any other
            value the features are not normalized and plain dot products are compared.
        verbose: bool, optional, default: True
            When True, a progress bar is displayed.

        Returns
        -------
        neighbors: 2d Numpy array, shape: [n_objects, k]
            Row i holds the indices of the k objects most similar to object i, in no particular order.
            Indices are stored as floats.

        Raises
        ------
        ValueError
            If k is not between 1 and the number of objects.
        """
        features = np.asarray(features)
        if features.dtype.kind in "biu":
            # Integer similarities could not hold the -inf used below to mask an object's
            # similarity with itself.
            features = features.astype(float)

        # Some util variables
        n = len(features)
        if n > 0 and not 1 <= k <= n:
            raise ValueError(
                "k must be between 1 and the number of objects ({}), got {}".format(n, k)
            )
        N = np.zeros((n, k))

        if similarity == "cosine":
            # Normalize features to lie on a unit hypersphere; the tiny constant avoids
            # a division by zero for all-zero rows.
            l2_norm = np.linalg.norm(features, 2, axis=1, keepdims=True)
            features = features / (l2_norm + 1e-20)

        for i in trange(n, desc="Building KNN graph", disable=not verbose):
            sim = features.dot(features[i])
            sim[i] = -np.inf  # ignore current idx
            # argpartition places the k largest similarities in the last k slots (unsorted).
            k_largest_idx = np.argpartition(sim, -k)[-k:]
            N[i] = k_largest_idx

        return N


    @classmethod
    def from_feature(
        cls, features, k=5, ids=None, similarity="cosine", symmetric=False, verbose=True
    ):
        """Instantiate a GraphModality with a KNN graph built using input features.

        Parameters
        ----------
        features: 2d Numpy array, shape: [n_objects, n_features], required
            A 2d Numpy array of features, e.g., visual, textual, etc.

        k: int, optional, default: 5
            The number of nearest neighbors

        ids: array, optional, default: None
            The list of object ids or labels, which align with the rows of features.
            For instance if you use textual (bag-of-word) features,
            then "ids" should be the same as the input to cornac.data.TextModality.
            When omitted, the row indices of features are used as ids.

        similarity: string, optional, default: "cosine"
            The similarity measure. At this time only the cosine is supported

        symmetric: bool, optional, default: False
            When True the resulting KNN-Graph is made symmetric

        verbose: bool, optional, default: True
            The verbosity flag.

        Returns
        -------
        graph_modality: :obj:`<cornac.data.GraphModality>`
            GraphModality object (an instance of the class this method is called on)
            whose data is the set of KNN relations, each with value 1.0.
        """
        # build knn graph; helpers are resolved through cls so that subclass overrides are honored
        knn_graph_array = cls._build_knn(features, k, similarity, verbose=verbose)
        knn_graph_triplet = cls._to_triplet(mat=knn_graph_array, ids=ids)
        if symmetric:
            if verbose:
                print("Symmetrizing the graph")
            knn_graph_triplet = cls._to_symmetric(knn_graph_triplet)

        return cls(data=knn_graph_triplet)