# Copyright 2018 The Cornac Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import copy
import os
import pickle
import warnings
from collections import Counter, OrderedDict, defaultdict
import numpy as np
from scipy.sparse import csc_matrix, csr_matrix, dok_matrix
from ..utils import estimate_batches, get_rng, validate_format
[docs]
class Dataset(object):
"""Training set contains preference matrix
Parameters
----------
num_users: int, required
Number of users.
num_items: int, required
Number of items.
uid_map: :obj:`OrderDict`, required
The dictionary containing mapping from user original ids to mapped integer indices.
iid_map: :obj:`OrderDict`, required
The dictionary containing mapping from item original ids to mapped integer indices.
uir_tuple: tuple, required
Tuple of 3 numpy arrays (user_indices, item_indices, rating_values).
timestamps: numpy.array, optional, default: None
Array of timestamps corresponding to observations in `uir_tuple`.
seed: int, optional, default: None
Random seed for reproducing data sampling.
Attributes
----------
num_ratings: int
Number of rating observations in the dataset.
max_rating: float
Maximum value among the rating observations.
min_rating: float
Minimum value among the rating observations.
global_mean: float
Average value over the rating observations.
uir_tuple: tuple
Tuple three numpy arrays (user_indices, item_indices, rating_values).
timestamps: numpy.array
Numpy array of timestamps corresponding to feedback in `uir_tuple`.
This is only available when input data is in `UIRT` format.
"""
def __init__(
self,
num_users,
num_items,
uid_map,
iid_map,
uir_tuple,
timestamps=None,
seed=None,
):
self.num_users = num_users
self.num_items = num_items
self.uid_map = uid_map
self.iid_map = iid_map
self.uir_tuple = uir_tuple
self.timestamps = timestamps
self.seed = seed
self.rng = get_rng(seed)
(_, _, r_values) = uir_tuple
self.num_ratings = len(r_values)
self.max_rating = np.max(r_values)
self.min_rating = np.min(r_values)
self.global_mean = np.mean(r_values)
self.__user_ids = None
self.__item_ids = None
self.__user_data = None
self.__item_data = None
self.__chrono_user_data = None
self.__chrono_item_data = None
self.__csr_matrix = None
self.__csc_matrix = None
self.__dok_matrix = None
self.ignored_attrs = [
"__user_ids",
"__item_ids",
"__user_data",
"__item_data",
"__chrono_user_data",
"__chrono_item_data",
"__csr_matrix",
"__csc_matrix",
"__dok_matrix",
]
@property
def user_ids(self):
"""Return the list of raw user ids"""
if self.__user_ids is None:
self.__user_ids = list(self.uid_map.keys())
return self.__user_ids
@property
def item_ids(self):
"""Return the list of raw item ids"""
if self.__item_ids is None:
self.__item_ids = list(self.iid_map.keys())
return self.__item_ids
@property
def user_data(self):
"""Data organized by user. A dictionary where keys are users,
values are tuples of two lists (items, ratings) interacted by the corresponding users.
"""
if self.__user_data is None:
self.__user_data = defaultdict()
for u, i, r in zip(*self.uir_tuple):
u_data = self.__user_data.setdefault(u, ([], []))
u_data[0].append(i)
u_data[1].append(r)
return self.__user_data
@property
def item_data(self):
"""Data organized by item. A dictionary where keys are items,
values are tuples of two lists (users, ratings) interacted with the corresponding items.
"""
if self.__item_data is None:
self.__item_data = defaultdict()
for u, i, r in zip(*self.uir_tuple):
i_data = self.__item_data.setdefault(i, ([], []))
i_data[0].append(u)
i_data[1].append(r)
return self.__item_data
@property
def chrono_user_data(self):
"""Data organized by user sorted chronologically (timestamps required).
A dictionary where keys are users, values are tuples of three chronologically
sorted lists (items, ratings, timestamps) interacted by the corresponding users.
"""
if self.timestamps is None:
raise ValueError("Timestamps are required but None!")
if self.__chrono_user_data is None:
self.__chrono_user_data = defaultdict()
for u, i, r, t in zip(*self.uir_tuple, self.timestamps):
u_data = self.__chrono_user_data.setdefault(u, ([], [], []))
u_data[0].append(i)
u_data[1].append(r)
u_data[2].append(t)
# sorting based on timestamps
for user, (items, ratings, timestamps) in self.__chrono_user_data.items():
sorted_idx = np.argsort(timestamps)
sorted_items = [items[i] for i in sorted_idx]
sorted_ratings = [ratings[i] for i in sorted_idx]
sorted_timestamps = [timestamps[i] for i in sorted_idx]
self.__chrono_user_data[user] = (
sorted_items,
sorted_ratings,
sorted_timestamps,
)
return self.__chrono_user_data
@property
def chrono_item_data(self):
"""Data organized by item sorted chronologically (timestamps required).
A dictionary where keys are items, values are tuples of three chronologically
sorted lists (users, ratings, timestamps) interacted with the corresponding items.
"""
if self.timestamps is None:
raise ValueError("Timestamps are required but None!")
if self.__chrono_item_data is None:
self.__chrono_item_data = defaultdict()
for u, i, r, t in zip(*self.uir_tuple, self.timestamps):
i_data = self.__chrono_item_data.setdefault(i, ([], [], []))
i_data[0].append(u)
i_data[1].append(r)
i_data[2].append(t)
# sorting based on timestamps
for item, (users, ratings, timestamps) in self.__chrono_item_data.items():
sorted_idx = np.argsort(timestamps)
sorted_users = [users[i] for i in sorted_idx]
sorted_ratings = [ratings[i] for i in sorted_idx]
sorted_timestamps = [timestamps[i] for i in sorted_idx]
self.__chrono_item_data[item] = (
sorted_users,
sorted_ratings,
sorted_timestamps,
)
return self.__chrono_item_data
@property
def matrix(self):
"""The user-item interaction matrix in CSR sparse format"""
return self.csr_matrix
@property
def csr_matrix(self):
"""The user-item interaction matrix in CSR sparse format"""
if self.__csr_matrix is None:
(u_indices, i_indices, r_values) = self.uir_tuple
self.__csr_matrix = csr_matrix(
(r_values, (u_indices, i_indices)),
shape=(self.num_users, self.num_items),
)
return self.__csr_matrix
@property
def csc_matrix(self):
"""The user-item interaction matrix in CSC sparse format"""
if self.__csc_matrix is None:
(u_indices, i_indices, r_values) = self.uir_tuple
self.__csc_matrix = csc_matrix(
(r_values, (u_indices, i_indices)),
shape=(self.num_users, self.num_items),
)
return self.__csc_matrix
@property
def dok_matrix(self):
"""The user-item interaction matrix in DOK sparse format"""
if self.__dok_matrix is None:
self.__dok_matrix = dok_matrix((self.num_users, self.num_items), dtype="float")
for u, i, r in zip(*self.uir_tuple):
self.__dok_matrix[u, i] = r
return self.__dok_matrix
[docs]
@classmethod
def build(
cls,
data,
fmt="UIR",
global_uid_map=None,
global_iid_map=None,
seed=None,
exclude_unknowns=False,
):
"""Constructing Dataset from given data of specific format.
Parameters
----------
data: array-like, required
Data in the form of triplets (user, item, rating) for UIR format,
or quadruplets (user, item, rating, timestamps) for UIRT format.
fmt: str, default: 'UIR'
Format of the input data. Currently, we are supporting:
'UIR': User, Item, Rating
'UIRT': User, Item, Rating, Timestamp
global_uid_map: :obj:`defaultdict`, optional, default: None
The dictionary containing global mapping from original ids to mapped ids of users.
global_iid_map: :obj:`defaultdict`, optional, default: None
The dictionary containing global mapping from original ids to mapped ids of items.
seed: int, optional, default: None
Random seed for reproducing data sampling.
exclude_unknowns: bool, default: False
Ignore unknown users and items.
Returns
-------
res: :obj:`<cornac.data.Dataset>`
Dataset object.
"""
fmt = validate_format(fmt, ["UIR", "UIRT"])
if global_uid_map is None:
global_uid_map = OrderedDict()
if global_iid_map is None:
global_iid_map = OrderedDict()
uid_map = OrderedDict()
iid_map = OrderedDict()
u_indices = []
i_indices = []
r_values = []
valid_idx = []
ui_set = set() # avoid duplicate observations
dup_count = 0
for idx, (uid, iid, rating, *_) in enumerate(data):
if exclude_unknowns and (uid not in global_uid_map or iid not in global_iid_map):
continue
if (uid, iid) in ui_set:
dup_count += 1
continue
ui_set.add((uid, iid))
uid_map[uid] = global_uid_map.setdefault(uid, len(global_uid_map))
iid_map[iid] = global_iid_map.setdefault(iid, len(global_iid_map))
u_indices.append(uid_map[uid])
i_indices.append(iid_map[iid])
r_values.append(float(rating))
valid_idx.append(idx)
if dup_count > 0:
warnings.warn("%d duplicated observations are removed!" % dup_count)
if len(ui_set) == 0:
raise ValueError("data is empty after being filtered!")
uir_tuple = (
np.asarray(u_indices, dtype="int"),
np.asarray(i_indices, dtype="int"),
np.asarray(r_values, dtype="float"),
)
timestamps = np.fromiter((int(data[i][3]) for i in valid_idx), dtype="int") if fmt == "UIRT" else None
dataset = cls(
num_users=len(global_uid_map),
num_items=len(global_iid_map),
uid_map=global_uid_map,
iid_map=global_iid_map,
uir_tuple=uir_tuple,
timestamps=timestamps,
seed=seed,
)
return dataset
[docs]
@classmethod
def from_uir(cls, data, seed=None):
"""Constructing Dataset from UIR (User, Item, Rating) triplet data.
Parameters
----------
data: array-like, shape: [n_examples, 3]
Data in the form of triplets (user, item, rating)
seed: int, optional, default: None
Random seed for reproducing data sampling.
Returns
-------
res: :obj:`<cornac.data.Dataset>`
Dataset object.
"""
return cls.build(data, fmt="UIR", seed=seed)
[docs]
@classmethod
def from_uirt(cls, data, seed=None):
"""Constructing Dataset from UIRT (User, Item, Rating, Timestamp)
quadruplet data.
Parameters
----------
data: array-like, shape: [n_examples, 4]
Data in the form of triplets (user, item, rating, timestamp)
seed: int, optional, default: None
Random seed for reproducing data sampling.
Returns
-------
res: :obj:`<cornac.data.Dataset>`
Dataset object.
"""
return cls.build(data, fmt="UIRT", seed=seed)
[docs]
def reset(self):
"""Reset the random number generator for reproducibility"""
self.rng = get_rng(self.seed)
return self
[docs]
def num_batches(self, batch_size):
"""Estimate number of batches per epoch"""
return estimate_batches(len(self.uir_tuple[0]), batch_size)
[docs]
def num_user_batches(self, batch_size):
"""Estimate number of batches per epoch iterating over users"""
return estimate_batches(self.num_users, batch_size)
[docs]
def num_item_batches(self, batch_size):
"""Estimate number of batches per epoch iterating over items"""
return estimate_batches(self.num_items, batch_size)
[docs]
def idx_iter(self, idx_range, batch_size=1, shuffle=False):
"""Create an iterator over batch of indices
Parameters
----------
batch_size: int, optional, default = 1
shuffle: bool, optional
If True, orders of triplets will be randomized. If False, default orders kept
Returns
-------
iterator : batch of indices (array of 'int')
"""
indices = np.arange(idx_range)
if shuffle:
self.rng.shuffle(indices)
n_batches = estimate_batches(len(indices), batch_size)
for b in range(n_batches):
start_offset = batch_size * b
end_offset = batch_size * b + batch_size
end_offset = min(end_offset, len(indices))
batch_ids = indices[start_offset:end_offset]
yield batch_ids
[docs]
def uir_iter(self, batch_size=1, shuffle=False, binary=False, num_zeros=0):
"""Create an iterator over data yielding batch of users, items, and rating values
Parameters
----------
batch_size: int, optional, default = 1
shuffle: bool, optional, default: False
If `True`, orders of triplets will be randomized. If `False`, default orders kept.
binary: bool, optional, default: False
If `True`, non-zero ratings will be turned into `1`, otherwise, values remain unchanged.
num_zeros: int, optional, default = 0
Number of unobserved ratings (zeros) to be added per user. This could be used
for negative sampling. By default, no values are added.
Returns
-------
iterator : batch of users (array of 'int'), batch of items (array of 'int'),
batch of ratings (array of 'float')
"""
for batch_ids in self.idx_iter(len(self.uir_tuple[0]), batch_size, shuffle):
batch_users = self.uir_tuple[0][batch_ids]
batch_items = self.uir_tuple[1][batch_ids]
if binary:
batch_ratings = np.ones_like(batch_items)
else:
batch_ratings = self.uir_tuple[2][batch_ids]
if num_zeros > 0:
repeated_users = batch_users.repeat(num_zeros)
neg_items = np.empty_like(repeated_users)
for i, u in enumerate(repeated_users):
j = self.rng.randint(0, self.num_items)
while self.dok_matrix[u, j] > 0:
j = self.rng.randint(0, self.num_items)
neg_items[i] = j
batch_users = np.concatenate((batch_users, repeated_users))
batch_items = np.concatenate((batch_items, neg_items))
batch_ratings = np.concatenate((batch_ratings, np.zeros_like(neg_items)))
yield batch_users, batch_items, batch_ratings
[docs]
def uij_iter(self, batch_size=1, shuffle=False, neg_sampling="uniform"):
"""Create an iterator over data yielding batch of users, positive items, and negative items
Parameters
----------
batch_size: int, optional, default = 1
shuffle: bool, optional, default: False
If `True`, orders of triplets will be randomized. If `False`, default orders kept.
neg_sampling: str, optional, default: 'uniform'
How negative item `j` will be sampled. Supported options: {`uniform`, `popularity`}.
Returns
-------
iterator : batch of users (array of 'int'), batch of positive items (array of 'int'),
batch of negative items (array of 'int')
"""
if neg_sampling.lower() == "uniform":
neg_population = np.arange(self.num_items)
elif neg_sampling.lower() == "popularity":
neg_population = self.uir_tuple[1]
else:
raise ValueError("Unsupported negative sampling option: {}".format(neg_sampling))
for batch_ids in self.idx_iter(len(self.uir_tuple[0]), batch_size, shuffle):
batch_users = self.uir_tuple[0][batch_ids]
batch_pos_items = self.uir_tuple[1][batch_ids]
batch_pos_ratings = self.uir_tuple[2][batch_ids]
batch_neg_items = np.empty_like(batch_pos_items)
for i, (user, pos_rating) in enumerate(zip(batch_users, batch_pos_ratings)):
neg_item = self.rng.choice(neg_population)
while self.dok_matrix[user, neg_item] >= pos_rating:
neg_item = self.rng.choice(neg_population)
batch_neg_items[i] = neg_item
yield batch_users, batch_pos_items, batch_neg_items
[docs]
def user_iter(self, batch_size=1, shuffle=False):
"""Create an iterator over user indices
Parameters
----------
batch_size : int, optional, default = 1
shuffle : bool, optional
If True, orders of triplets will be randomized. If False, default orders kept
Returns
-------
iterator : batch of user indices (array of 'int')
"""
user_indices = np.fromiter(set(self.uir_tuple[0]), dtype="int")
for batch_ids in self.idx_iter(len(user_indices), batch_size, shuffle):
yield user_indices[batch_ids]
[docs]
def item_iter(self, batch_size=1, shuffle=False):
"""Create an iterator over item indices
Parameters
----------
batch_size : int, optional, default = 1
shuffle : bool, optional
If True, orders of triplets will be randomized. If False, default orders kept
Returns
-------
iterator : batch of item indices (array of 'int')
"""
item_indices = np.fromiter(set(self.uir_tuple[1]), "int")
for batch_ids in self.idx_iter(len(item_indices), batch_size, shuffle):
yield item_indices[batch_ids]
def add_modalities(self, **kwargs):
self.user_feature = kwargs.get("user_feature", None)
self.item_feature = kwargs.get("item_feature", None)
self.user_text = kwargs.get("user_text", None)
self.item_text = kwargs.get("item_text", None)
self.user_image = kwargs.get("user_image", None)
self.item_image = kwargs.get("item_image", None)
self.user_graph = kwargs.get("user_graph", None)
self.item_graph = kwargs.get("item_graph", None)
self.sentiment = kwargs.get("sentiment", None)
self.review_text = kwargs.get("review_text", None)
def __deepcopy__(self, memo):
cls = self.__class__
result = cls.__new__(cls)
for k, v in self.__dict__.items():
if k in self.ignored_attrs:
continue
setattr(result, k, copy.deepcopy(v))
return result
[docs]
def save(self, fpath):
"""Save a dataset to the filesystem.
Parameters
----------
fpath: str, required
Path to a file for the dataset to be stored.
"""
os.makedirs(os.path.dirname(fpath), exist_ok=True)
dataset = copy.deepcopy(self)
pickle.dump(dataset, open(fpath, "wb"), protocol=pickle.HIGHEST_PROTOCOL)
[docs]
@staticmethod
def load(fpath):
"""Load a dataset from the filesystem.
Parameters
----------
fpath: str, required
Path to a file where the dataset is stored.
Returns
-------
self : object
"""
dataset = pickle.load(open(fpath, "rb"))
dataset.load_from = fpath # for further loading
return dataset
[docs]
class BasketDataset(Dataset):
"""Training set contains history baskets
Parameters
----------
num_users: int, required
Number of users.
num_items: int, required
Number of items.
uid_map: :obj:`OrderDict`, required
The dictionary containing mapping from user original ids to mapped integer indices.
iid_map: :obj:`OrderDict`, required
The dictionary containing mapping from item original ids to mapped integer indices.
uir_tuple: tuple, required
Tuple of 3 numpy arrays (user_indices, item_indices, rating_values).
basket_indices: numpy.array, required
Array of basket indices corresponding to observation in `uir_tuple`.
timestamps: numpy.array, optional, default: None
Numpy array of timestamps corresponding to feedback in `uir_tuple`.
This is only available when input data is in `UBIT` and `UBITJson` formats.
extra_data: numpy.array, optional, default: None
Array of json object corresponding to observations in `uir_tuple`.
seed: int, optional, default: None
Random seed for reproducing data sampling.
Attributes
----------
ubi_tuple: tuple
Tuple (user_indices, baskets).
timestamps: numpy.array
Numpy array of timestamps corresponding to feedback in `ubi_tuple`.
This is only available when input data is in `UTB` format.
"""
def __init__(
self,
num_users,
num_baskets,
num_items,
uid_map,
bid_map,
iid_map,
uir_tuple,
basket_indices=None,
timestamps=None,
extra_data=None,
seed=None,
):
super().__init__(
num_users=num_users,
num_items=num_items,
uid_map=uid_map,
iid_map=iid_map,
uir_tuple=uir_tuple,
timestamps=timestamps,
seed=seed,
)
self.num_baskets = num_baskets
self.bid_map = bid_map
self.basket_indices = basket_indices
self.extra_data = extra_data
basket_sizes = list(Counter(basket_indices).values())
self.max_basket_size = np.max(basket_sizes)
self.min_basket_size = np.min(basket_sizes)
self.avg_basket_size = np.mean(basket_sizes)
self.__baskets = None
self.__basket_ids = None
self.__user_basket_data = None
self.__chrono_user_basket_data = None
@property
def basket_ids(self):
"""Return the list of raw basket ids"""
if self.__basket_ids is None:
self.__basket_ids = list(self.bid_map.keys())
return self.__basket_ids
@property
def baskets(self):
"""A dictionary to store indices where basket ID appears in the data."""
if self.__baskets is None:
self.__baskets = defaultdict(list)
for idx, bid in enumerate(self.basket_indices):
self.__baskets[bid].append(idx)
return self.__baskets
@property
def user_basket_data(self):
"""Data organized by user. A dictionary where keys are users,
values are list of baskets purchased by corresponding users.
"""
if self.__user_basket_data is None:
self.__user_basket_data = defaultdict(list)
for bid, ids in self.baskets.items():
u = self.uir_tuple[0][ids[0]]
self.__user_basket_data[u].append(bid)
return self.__user_basket_data
@property
def chrono_user_basket_data(self):
"""Data organized by user sorted chronologically (timestamps required).
A dictionary where keys are users, values are tuples of three chronologically
sorted lists (baskets, timestamps) interacted by the corresponding users.
"""
if self.__chrono_user_basket_data is None:
assert self.timestamps is not None # we need timestamps
basket_timestamps = [self.timestamps[ids[0]] for ids in self.baskets.values()] # one-off
self.__chrono_user_basket_data = defaultdict(lambda: ([], []))
for (bid, ids), t in zip(self.baskets.items(), basket_timestamps):
u = self.uir_tuple[0][ids[0]]
self.__chrono_user_basket_data[u][0].append(bid)
self.__chrono_user_basket_data[u][1].append(t)
# sorting based on timestamps
for user, (baskets, timestamps) in self.__chrono_user_basket_data.items():
sorted_idx = np.argsort(timestamps)
sorted_baskets = [baskets[i] for i in sorted_idx]
sorted_timestamps = [timestamps[i] for i in sorted_idx]
self.__chrono_user_basket_data[user] = (
sorted_baskets,
sorted_timestamps,
)
return self.__chrono_user_basket_data
[docs]
@classmethod
def build(
cls,
data,
fmt="UBI",
global_uid_map=None,
global_bid_map=None,
global_iid_map=None,
seed=None,
exclude_unknowns=False,
):
"""Constructing Dataset from given data of specific format.
Parameters
----------
data: list, required
Data in the form of tuple (user, basket) for UB format,
or tuple (user, timestamps, basket) for UTB format.
fmt: str, default: 'UBI'
Format of the input data. Currently, we are supporting:
'UBI': User, Basket_ID, Item
'UBIT': User, Basket_ID, Item, Timestamp
'UBITJson': User, Basket_ID, Item, Timestamp, Extra data in Json format
global_uid_map: :obj:`defaultdict`, optional, default: None
The dictionary containing global mapping from original ids to mapped ids of users.
global_bid_map: :obj:`defaultdict`, optional, default: None
The dictionary containing global mapping from original ids to mapped ids of baskets.
global_iid_map: :obj:`defaultdict`, optional, default: None
The dictionary containing global mapping from original ids to mapped ids of items.
seed: int, optional, default: None
Random seed for reproducing data sampling.
exclude_unknowns: bool, default: False
Ignore unknown users and items.
Returns
-------
res: :obj:`<cornac.data.BasketDataset>`
BasketDataset object.
"""
fmt = validate_format(fmt, ["UBI", "UBIT", "UBITJson"])
if global_uid_map is None:
global_uid_map = OrderedDict()
if global_bid_map is None:
global_bid_map = OrderedDict()
if global_iid_map is None:
global_iid_map = OrderedDict()
u_indices = []
b_indices = []
i_indices = []
valid_idx = []
extra_data = []
for idx, (uid, bid, iid, *_) in enumerate(data):
if exclude_unknowns and (iid not in global_iid_map):
continue
global_uid_map.setdefault(uid, len(global_uid_map))
global_bid_map.setdefault(bid, len(global_bid_map))
global_iid_map.setdefault(iid, len(global_iid_map))
u_indices.append(global_uid_map[uid])
b_indices.append(global_bid_map[bid])
i_indices.append(global_iid_map[iid])
valid_idx.append(idx)
uir_tuple = (
np.asarray(u_indices, dtype="int"),
np.asarray(i_indices, dtype="int"),
np.ones(len(u_indices), dtype="float"),
)
basket_indices = np.asarray(b_indices, dtype="int")
timestamps = (
np.fromiter((int(data[i][3]) for i in valid_idx), dtype="int") if fmt in ["UBIT", "UBITJson"] else None
)
extra_data = [data[i][4] for i in valid_idx] if fmt == "UBITJson" else None
dataset = cls(
num_users=len(global_uid_map),
num_baskets=len(global_bid_map),
num_items=len(global_iid_map),
uid_map=global_uid_map,
bid_map=global_bid_map,
iid_map=global_iid_map,
uir_tuple=uir_tuple,
basket_indices=basket_indices,
timestamps=timestamps,
extra_data=extra_data,
seed=seed,
)
return dataset
[docs]
@classmethod
def from_ubi(cls, data, seed=None):
"""Constructing Dataset from UBI (User, Basket, Item) triples data.
Parameters
----------
data: list
Data in the form of tuples (user, basket, item).
seed: int, optional, default: None
Random seed for reproducing data sampling.
Returns
-------
res: :obj:`<cornac.data.BasketDataset>`
BasketDataset object.
"""
return cls.build(data, fmt="UBI", seed=seed)
[docs]
@classmethod
def from_ubit(cls, data, seed=None):
"""Constructing Dataset from UBIT format (User, Basket, Item, Timestamp)
Parameters
----------
data: tuple
Data in the form of quadruples (user, basket, item, timestamp)
seed: int, optional, default: None
Random seed for reproducing data sampling.
Returns
-------
res: :obj:`<cornac.data.BasketDataset>`
BasketDataset object.
"""
return cls.build(data, fmt="UBIT", seed=seed)
[docs]
@classmethod
def from_ubitjson(cls, data, seed=None):
"""Constructing Dataset from UBITJson format (User, Basket, Item, Timestamp, Json)
Parameters
----------
data: tuple
Data in the form of tuples (user, basket, item, timestamp, json)
seed: int, optional, default: None
Random seed for reproducing data sampling.
Returns
-------
res: :obj:`<cornac.data.BasketDataset>`
BasketDataset object.
"""
return cls.build(data, fmt="UBITJson", seed=seed)
[docs]
def ub_iter(self, batch_size=1, shuffle=False):
"""Create an iterator over data yielding batch of users and batch of baskets
Parameters
----------
batch_size: int, optional, default = 1
shuffle: bool, optional, default: False
If `True`, orders of users will be randomized. If `False`, default orders kept.
Returns
-------
iterator : batch of user indices, batch of baskets corresponding to user indices
"""
for batch_users in self.user_iter(batch_size, shuffle):
batch_baskets = [self.user_basket_data[uid] for uid in batch_users]
yield batch_users, batch_baskets
[docs]
def ubi_iter(self, batch_size=1, shuffle=False):
"""Create an iterator over data yielding batch of users, basket ids, and batch of the corresponding items
Parameters
----------
batch_size: int, optional, default = 1
shuffle: bool, optional, default: False
If `True`, orders of users will be randomized. If `False`, default orders kept.
Returns
-------
iterator : batch of user indices, batch of baskets corresponding to user indices, and batch of items correponding to baskets
"""
_, item_indices, _ = self.uir_tuple
for batch_users, batch_baskets in self.ub_iter(batch_size, shuffle):
batch_basket_items = [
[item_indices[self.baskets[bid]] for bid in user_baskets] for user_baskets in batch_baskets
]
yield batch_users, batch_baskets, batch_basket_items
[docs]
def basket_iter(self, batch_size=1, shuffle=False):
"""Create an iterator over data yielding batch of basket indices
Parameters
----------
batch_size: int, optional, default = 1
shuffle: bool, optional, default: False
If `True`, orders of triplets will be randomized. If `False`, default orders kept.
Returns
-------
iterator : batch of basket indices (array of 'int')
"""
basket_indices = np.fromiter(set(self.baskets.keys()), dtype="int")
for batch_ids in self.idx_iter(len(basket_indices), batch_size, shuffle):
yield basket_indices[batch_ids]
[docs]
class SequentialDataset(Dataset):
"""Training set contains history sessions
Parameters
----------
num_users: int, required
Number of users.
num_items: int, required
Number of items.
uid_map: :obj:`OrderDict`, required
The dictionary containing mapping from user original ids to mapped integer indices.
iid_map: :obj:`OrderDict`, required
The dictionary containing mapping from item original ids to mapped integer indices.
uir_tuple: tuple, required
Tuple of 3 numpy arrays (user_indices, item_indices, rating_values).
session_ids: numpy.array, required
Array of session indices corresponding to observation in `uir_tuple`.
timestamps: numpy.array, optional, default: None
Numpy array of timestamps corresponding to feedback in `uir_tuple`.
This is only available when input data is in `SIT`, `USIT`, SITJson`, and `USITJson` formats.
extra_data: numpy.array, optional, default: None
Array of json object corresponding to observations in `uir_tuple`.
seed: int, optional, default: None
Random seed for reproducing data sampling.
Attributes
----------
timestamps: numpy.array
Numpy array of timestamps corresponding to feedback in `ubi_tuple`.
This is only available when input data is in `UTB` format.
"""
def __init__(
self,
num_users,
num_sessions,
num_items,
uid_map,
sid_map,
iid_map,
uir_tuple,
session_indices=None,
timestamps=None,
extra_data=None,
seed=None,
):
super().__init__(
num_users=num_users,
num_items=num_items,
uid_map=uid_map,
iid_map=iid_map,
uir_tuple=uir_tuple,
timestamps=timestamps,
seed=seed,
)
self.num_sessions = num_sessions
self.sid_map = sid_map
self.session_indices = session_indices
self.extra_data = extra_data
session_sizes = list(Counter(session_indices).values())
self.max_session_size = np.max(session_sizes)
self.min_session_size = np.min(session_sizes)
self.avg_session_size = np.mean(session_sizes)
self.__sessions = None
self.__session_ids = None
self.__user_session_data = None
self.__chrono_user_session_data = None
@property
def session_ids(self):
"""Return the list of raw session ids"""
if self.__session_ids is None:
self.__session_ids = list(self.sid_map.keys())
return self.__session_ids
@property
def sessions(self):
"""A dictionary to store indices where session ID appears in the data."""
if self.__sessions is None:
self.__sessions = OrderedDict()
for idx, sid in enumerate(self.session_indices):
self.__sessions.setdefault(sid, [])
self.__sessions[sid].append(idx)
return self.__sessions
@property
def user_session_data(self):
"""Data organized by user. A dictionary where keys are users,
values are list of sessions purchased by corresponding users.
"""
if self.__user_session_data is None:
self.__user_session_data = defaultdict(list)
for sid, ids in self.sessions.items():
u = self.uir_tuple[0][ids[0]]
self.__user_session_data[u].append(sid)
return self.__user_session_data
@property
def chrono_user_session_data(self):
"""Data organized by user sorted chronologically (timestamps required).
A dictionary where keys are users, values are tuples of three chronologically
sorted lists (sessions, timestamps) interacted by the corresponding users.
"""
if self.__chrono_user_session_data is None:
assert self.timestamps is not None # we need timestamps
session_timestamps = [self.timestamps[ids[0]] for ids in self.sessions.values()] # one-off
self.__chrono_user_session_data = defaultdict(lambda: ([], []))
for (sid, ids), t in zip(self.sessions.items(), session_timestamps):
u = self.uir_tuple[0][ids[0]]
self.__chrono_user_session_data[u][0].append(sid)
self.__chrono_user_session_data[u][1].append(t)
# sorting based on timestamps
for user, (sessions, timestamps) in self.__chrono_user_session_data.items():
sorted_idx = np.argsort(timestamps)
sorted_sessions = [sessions[i] for i in sorted_idx]
sorted_timestamps = [timestamps[i] for i in sorted_idx]
self.__chrono_user_session_data[user] = (
sorted_sessions,
sorted_timestamps,
)
return self.__chrono_user_session_data
[docs]
@classmethod
def build(
cls,
data,
fmt="SIT",
global_uid_map=None,
global_sid_map=None,
global_iid_map=None,
seed=None,
exclude_unknowns=False,
):
"""Constructing Dataset from given data of specific format.
Parameters
----------
data: list, required
Data in the form of tuple (user, session) for UB format,
or tuple (user, timestamps, session) for UTB format.
fmt: str, default: 'SIT'
Format of the input data. Currently, we are supporting:
'SIT': Session_ID, Item, Timestamp
'USIT': User, Session_ID, Item, Timestamp
'SITJson': Session_ID, Item, Timestamp, Extra data in Json format
'USITJson': User, Session_ID, Item, Timestamp, Extra data in Json format
global_uid_map: :obj:`defaultdict`, optional, default: None
The dictionary containing global mapping from original ids to mapped ids of users.
global_sid_map: :obj:`defaultdict`, optional, default: None
The dictionary containing global mapping from original ids to mapped ids of sessions.
global_iid_map: :obj:`defaultdict`, optional, default: None
The dictionary containing global mapping from original ids to mapped ids of items.
seed: int, optional, default: None
Random seed for reproducing data sampling.
exclude_unknowns: bool, default: False
Ignore unknown users and items.
Returns
-------
res: :obj:`<cornac.data.SequentialDataset>`
SequentialDataset object.
"""
fmt = validate_format(fmt, ["SIT", "USIT", "SITJson", "USITJson"])
if global_uid_map is None:
global_uid_map = OrderedDict()
if global_sid_map is None:
global_sid_map = OrderedDict()
if global_iid_map is None:
global_iid_map = OrderedDict()
u_indices = []
s_indices = []
i_indices = []
valid_idx = []
extra_data = []
for idx, tup in enumerate(data):
uid, sid, iid, *_ = tup if fmt in ["USIT", "USITJson"] else [None] + list(tup)
if exclude_unknowns and (iid not in global_iid_map):
continue
global_uid_map.setdefault(uid, len(global_uid_map))
global_sid_map.setdefault(sid, len(global_sid_map))
global_iid_map.setdefault(iid, len(global_iid_map))
u_indices.append(global_uid_map[uid])
s_indices.append(global_sid_map[sid])
i_indices.append(global_iid_map[iid])
valid_idx.append(idx)
uir_tuple = (
np.asarray(u_indices, dtype="int"),
np.asarray(i_indices, dtype="int"),
np.ones(len(u_indices), dtype="float"),
)
session_indices = np.asarray(s_indices, dtype="int")
ts_pos = 3 if fmt in ["USIT", "USITJson"] else 2
timestamps = (
np.fromiter((int(data[i][ts_pos]) for i in valid_idx), dtype="int")
if fmt in ["SIT", "SITJson", "USIT", "USITJson"]
else None
)
extra_pos = ts_pos + 1
extra_data = [data[i][extra_pos] for i in valid_idx] if fmt in ["SITJson", "USITJson"] else None
dataset = cls(
num_users=len(global_uid_map),
num_sessions=len(set(session_indices)),
num_items=len(global_iid_map),
uid_map=global_uid_map,
sid_map=global_sid_map,
iid_map=global_iid_map,
uir_tuple=uir_tuple,
session_indices=session_indices,
timestamps=timestamps,
extra_data=extra_data,
seed=seed,
)
return dataset
[docs]
@classmethod
def from_sit(cls, data, seed=None):
"""Constructing Dataset from SIT (Session, Item, Timestamp) triples data.
Parameters
----------
data: list
Data in the form of tuples (session, item, timestamp).
seed: int, optional, default: None
Random seed for reproducing data sampling.
Returns
-------
res: :obj:`<cornac.data.SequentialDataset>`
SequentialDataset object.
"""
return cls.build(data, fmt="SIT", seed=seed)
[docs]
@classmethod
def from_usit(cls, data, seed=None):
"""Constructing Dataset from USIT format (User, Session, Item, Timestamp)
Parameters
----------
data: tuple
Data in the form of quadruples (user, session, item, timestamp)
seed: int, optional, default: None
Random seed for reproducing data sampling.
Returns
-------
res: :obj:`<cornac.data.SequentialDataset>`
SequentialDataset object.
"""
return cls.build(data, fmt="USIT", seed=seed)
[docs]
@classmethod
def from_sitjson(cls, data, seed=None):
"""Constructing Dataset from SITJson format (Session, Item, Timestamp, Json)
Parameters
----------
data: tuple
Data in the form of tuples (session, item, timestamp, json)
seed: int, optional, default: None
Random seed for reproducing data sampling.
Returns
-------
res: :obj:`<cornac.data.SequentialDataset>`
SequentialDataset object.
"""
return cls.build(data, fmt="SITJson", seed=seed)
[docs]
@classmethod
def from_usitjson(cls, data, seed=None):
"""Constructing Dataset from USITJson format (User, Session, Item, Timestamp, Json)
Parameters
----------
data: tuple
Data in the form of tuples (user, session, item, timestamp, json)
seed: int, optional, default: None
Random seed for reproducing data sampling.
Returns
-------
res: :obj:`<cornac.data.SequentialDataset>`
SequentialDataset object.
"""
return cls.build(data, fmt="USITJson", seed=seed)
[docs]
def num_batches(self, batch_size):
"""Estimate number of batches per epoch"""
return estimate_batches(len(self.sessions), batch_size)
[docs]
def session_iter(self, batch_size=1, shuffle=False):
"""Create an iterator over session indices
Parameters
----------
batch_size: int, optional, default = 1
shuffle: bool, optional, default: False
If `True`, orders of session_ids will be randomized. If `False`, default orders kept.
Returns
-------
iterator : batch of session indices (array of 'int')
"""
session_indices = np.array(list(self.sessions.keys()))
for batch_ids in self.idx_iter(len(session_indices), batch_size, shuffle):
batch_session_indices = session_indices[batch_ids]
yield batch_session_indices
[docs]
def s_iter(self, batch_size=1, shuffle=False):
"""Create an iterator over data yielding batch of sessions
Parameters
----------
batch_size: int, optional, default = 1
shuffle: bool, optional, default: False
If `True`, orders of sessions will be randomized. If `False`, default orders kept.
Returns
-------
iterator : batch of session indices, batch of indices corresponding to session indices
"""
for batch_session_ids in self.session_iter(batch_size, shuffle):
batch_mapped_ids = [self.sessions[sid] for sid in batch_session_ids]
yield batch_session_ids, batch_mapped_ids
[docs]
def si_iter(self, batch_size=1, shuffle=False):
"""Create an iterator over data yielding batch of session indices, batch of mapped ids, and batch of sessions' items
Parameters
----------
batch_size: int, optional, default = 1
shuffle: bool, optional, default: False
If `True`, orders of triplets will be randomized. If `False`, default orders kept.
Returns
-------
iterator : batch of session indices, batch mapped ids, batch of sessions' items (list of list)
"""
for batch_session_indices, batch_mapped_ids in self.s_iter(batch_size, shuffle):
batch_session_items = [[self.uir_tuple[1][i] for i in ids] for ids in batch_mapped_ids]
yield batch_session_indices, batch_mapped_ids, batch_session_items
[docs]
def usi_iter(self, batch_size=1, shuffle=False):
"""Create an iterator over data yielding batch of user indices, batch of session indices, batch of mapped ids, and batch of sessions' items
Parameters
----------
batch_size: int, optional, default = 1
shuffle: bool, optional, default: False
If `True`, orders of triplets will be randomized. If `False`, default orders kept.
Returns
-------
iterator : batch of user indices, batch of session indices (list of list), batch mapped ids (list of list of list), batch of sessions' items (list of list of list)
"""
for user_indices in self.user_iter(batch_size, shuffle):
batch_sids = [[sid for sid in self.user_session_data[uid]] for uid in user_indices]
batch_mapped_ids = [[self.sessions[sid] for sid in self.user_session_data[uid]] for uid in user_indices]
batch_session_items = [[[self.uir_tuple[1][i] for i in ids] for ids in u_batch_mapped_ids] for u_batch_mapped_ids in batch_mapped_ids]
yield user_indices, batch_sids, batch_mapped_ids, batch_session_items