Source code for cornac.eval_methods.cross_validation

# Copyright 2018 The Cornac Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

import numpy as np

from ..utils.common import safe_indexing
from ..experiment.result import CVResult
from ..utils import get_rng
from ..data import Dataset
from .base_method import BaseMethod


class CrossValidation(BaseMethod):
    """Cross Validation Evaluation Method.

    Parameters
    ----------
    data: array-like, required
        Raw preference data in the triplet format [(user_id, item_id, rating_value)].

    n_folds: int, optional, default: 5
        The number of folds for cross validation.

    rating_threshold: float, optional, default: 1.0
        Threshold used to binarize rating values into positive or negative feedback for
        model evaluation using ranking metrics (rating metrics are not affected).

    partition: array-like, shape (n_observed_ratings,), optional, default: None
        The partition of ratings into n_folds (fold label of each rating).
        If `None`, random partitioning is performed to assign each rating into a fold.

    seed: int, optional, default: None
        Random seed for reproducibility.

    exclude_unknowns: bool, optional, default: True
        If `True`, unknown users and items will be ignored during model evaluation.

    verbose: bool, optional, default: False
        Output running log.
    """

    def __init__(
        self,
        data,
        n_folds=5,
        rating_threshold=1.0,
        partition=None,
        seed=None,
        exclude_unknowns=True,
        verbose=False,
        **kwargs
    ):
        BaseMethod.__init__(
            self,
            data=data,
            rating_threshold=rating_threshold,
            seed=seed,
            exclude_unknowns=exclude_unknowns,
            verbose=verbose,
            **kwargs
        )
        self.n_folds = n_folds
        self.n_ratings = len(self.data)
        self.current_fold = 0
        self.current_split = None
        self._partition = self._validate_partition(partition)

    def _partition_data(self):
        """Randomly partition ratings into n_folds of (almost) equal size."""
        fold_size = int(self.n_ratings / self.n_folds)
        remain_size = self.n_ratings - fold_size * self.n_folds

        # Assign fold_size ratings to each fold, then shuffle the labels.
        partition = np.repeat(np.arange(self.n_folds), fold_size)
        self.rng.shuffle(partition)

        if remain_size > 0:
            # Spread the leftover ratings across randomly chosen folds.
            remain_partition = self.rng.choice(
                self.n_folds, size=remain_size, replace=True, p=None
            )
            partition = np.concatenate((partition, remain_partition))

        return partition

    def _validate_partition(self, partition):
        if partition is None:
            return self._partition_data()
        elif len(partition) != self.n_ratings:
            raise ValueError(
                "The partition length must be equal to the number of ratings"
            )
        elif len(set(partition)) != self.n_folds:
            raise ValueError(
                "Number of folds in given partition differs from n_folds=%s"
                % self.n_folds
            )
        # Cast to ndarray so the elementwise comparisons in _get_train_test
        # also work when a plain Python list is passed.
        return np.asarray(partition)

    def _get_train_test(self):
        if self.verbose:
            print("Fold: {}".format(self.current_fold + 1))

        # Ratings labeled with the current fold form the test set;
        # everything else is used for training.
        test_idx = np.where(self._partition == self.current_fold)[0]
        train_idx = np.where(self._partition != self.current_fold)[0]

        train_data = safe_indexing(self.data, train_idx)
        test_data = safe_indexing(self.data, test_idx)
        self.build(train_data=train_data, test_data=test_data, val_data=test_data)

    def _next_fold(self):
        if self.current_fold < self.n_folds - 1:
            self.current_fold = self.current_fold + 1
        else:
            self.current_fold = 0
    def evaluate(self, model, metrics, user_based, show_validation):
        result = CVResult(model.name)
        # Train and evaluate on each fold in turn, aggregating per-fold results.
        for _ in range(self.n_folds):
            self._get_train_test()
            new_model = model.clone()  # clone a completely new model
            fold_result, _ = BaseMethod.evaluate(
                self, new_model, metrics, user_based, show_validation=False
            )
            result.append(fold_result)
            self._next_fold()
        result.organize()
        return result, None  # no validation result of CV
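
A minimal sketch of supplying a custom partition (the toy triplets, fold labels, and variable names below are illustrative, not part of the module): each rating receives a fold label in [0, n_folds), the label array must be as long as the data, and every fold label must occur at least once, otherwise _validate_partition raises a ValueError.

    import numpy as np
    from cornac.eval_methods import CrossValidation

    # Six hypothetical (user_id, item_id, rating_value) triplets.
    data = [
        ("u1", "i1", 4.0), ("u1", "i2", 3.0), ("u2", "i1", 5.0),
        ("u2", "i3", 2.0), ("u3", "i2", 4.0), ("u3", "i3", 1.0),
    ]

    # One fold label per rating; all 3 fold labels appear at least once.
    partition = np.array([0, 1, 2, 0, 1, 2])

    cv = CrossValidation(data=data, n_folds=3, partition=partition, seed=42)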
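And a minimal end-to-end usage sketch, assuming the MF model, the RMSE metric, and the MovieLens loader shipped with cornac: Experiment invokes evaluate() above, which trains a fresh clone of the model on each fold and aggregates the per-fold results into a CVResult.

    import cornac
    from cornac.eval_methods import CrossValidation
    from cornac.models import MF
    from cornac.metrics import RMSE

    # (user, item, rating) triplets from the built-in MovieLens loader.
    data = cornac.datasets.movielens.load_feedback()

    cv = CrossValidation(data=data, n_folds=5, seed=123, verbose=True)

    cornac.Experiment(
        eval_method=cv,
        models=[MF(k=10, max_iter=25, seed=123)],
        metrics=[RMSE()],
    ).run()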