Source code for myfm.utils.benchmark_data.movielens100k_data

from io import BytesIO
from pathlib import Path
from typing import List, Tuple

import pandas as pd

from .loader_base import MovieLensBase


class MovieLens100kDataManager(MovieLensBase):
    """The Data manager for MovieLens 100k dataset.

    All loaders read members of the ``ml-100k`` archive through ``self.zf``,
    which is supplied by the base class (presumably an open
    ``zipfile.ZipFile`` -- confirm against ``MovieLensBase``).
    """

    @property
    def DOWNLOAD_URL(self) -> str:
        """URL of the zipped MovieLens 100k dataset."""
        return "http://files.grouplens.org/datasets/movielens/ml-100k.zip"

    @property
    def DEFAULT_PATH(self) -> Path:
        """Default zip file path, ``~/.ml-100k.zip`` with ``~`` expanded."""
        return Path("~/.ml-100k.zip").expanduser()

    def _read_interaction(self, byte_stream: bytes) -> pd.DataFrame:
        """Parse the raw bytes of a tab-separated rating file.

        Parameters
        ----------
        byte_stream : bytes
            Content of a header-less, tab-separated file whose four columns
            are read as user id, movie id, rating and timestamp.

        Returns
        -------
        pd.DataFrame
            A dataframe with columns ``user_id``, ``movie_id``, ``rating``
            and ``timestamp``. ``timestamp`` is converted to datetime,
            interpreting the raw values as seconds.
        """
        with BytesIO(byte_stream) as ifs:
            data = pd.read_csv(
                ifs,
                sep="\t",
                header=None,
                names=["user_id", "movie_id", "rating", "timestamp"],
            )
        data["timestamp"] = pd.to_datetime(data["timestamp"], unit="s")
        return data

    def load_rating_all(self) -> pd.DataFrame:
        """Load the entire rating dataset.

        Returns
        -------
        pd.DataFrame
            all the available ratings.
        """
        return self._read_interaction(self.zf.read("ml-100k/u.data"))

    def load_rating_predefined_split(
        self,
        fold: int,
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Read the pre-defined train/test split.

        Fold index ranges from 1 to 5.

        Parameters
        ----------
        fold : int
            specifies the fold index.

        Returns
        -------
        Tuple[pd.DataFrame, pd.DataFrame]
            train and test dataframes.

        Raises
        ------
        ValueError
            If ``fold`` is not between 1 and 5 (inclusive).
        """
        # Validate explicitly instead of with ``assert``: assertions are
        # removed under ``python -O``, which would let a bad fold index
        # surface later as an obscure archive lookup failure.
        if not 1 <= fold <= 5:
            raise ValueError(
                f"fold must be between 1 and 5 (inclusive), got {fold!r}."
            )
        train_path = f"ml-100k/u{fold}.base"
        test_path = f"ml-100k/u{fold}.test"
        df_train = self._read_interaction(self.zf.read(train_path))
        df_test = self._read_interaction(self.zf.read(test_path))
        return df_train, df_test

    def load_user_info(self) -> pd.DataFrame:
        """Load user meta information.

        Returns
        -------
        pd.DataFrame
            user information with columns ``user_id``, ``age``, ``gender``,
            ``occupation`` and ``zipcode``.
        """
        user_info_bytes = self.zf.read("ml-100k/u.user")
        with BytesIO(user_info_bytes) as ifs:
            return pd.read_csv(
                ifs,
                sep="|",
                header=None,
                names=["user_id", "age", "gender", "occupation", "zipcode"],
            )

    def genres(self) -> List[str]:
        """Return the genre names listed in ``ml-100k/u.genre``.

        Returns
        -------
        List[str]
            The first ``|``-separated field of every row, in file order.
        """
        with BytesIO(self.zf.read("ml-100k/u.genre")) as ifs:
            genres: List[str] = list(pd.read_csv(ifs, sep="|", header=None)[0])
        return genres

    def load_movie_info(self) -> pd.DataFrame:
        r"""Load movie meta information.

        Returns
        -------
        pd.DataFrame
            A dataframe containing meta-information about the movies:
            ``movie_id``, ``title``, ``release_date`` (converted to
            datetime), ``unk``, ``url``, one indicator column per genre name
            returned by :meth:`genres`, and ``genres``.
            In ``genres``, multiple genres per movie are concatenated by "|";
            a movie without any flagged genre gets an empty string.
        """
        # "unk" is a raw field that is kept but not interpreted here
        # (presumably the dataset's video release date -- confirm).
        MOVIE_COLUMNS = ["movie_id", "title", "release_date", "unk", "url"]
        genre_names = self.genres()
        with BytesIO(self.zf.read("ml-100k/u.item")) as ifs:
            df_mov = pd.read_csv(
                ifs,
                sep="|",
                encoding="latin-1",
                header=None,
            )
        df_mov.columns = MOVIE_COLUMNS + genre_names
        df_mov["release_date"] = pd.to_datetime(df_mov.release_date)

        # Coordinates of every non-zero cell of the genre indicator matrix:
        # one (row position, genre position) pair per movie/genre assignment.
        movie_index, genre_index = df_mov[genre_names].values.nonzero()
        movie_genre_pairs = pd.DataFrame(
            {
                "movie_id": df_mov.movie_id.values[movie_index],
                "genre": [genre_names[i] for i in genre_index],
            }
        )
        joined_genres = (
            movie_genre_pairs.groupby("movie_id")
            .genre.agg(lambda names: "|".join(names))
            # Realign with the row order of df_mov; movies that had no
            # non-zero genre cell are missing from the groupby result.
            .reindex(df_mov.movie_id)
            .fillna("")
        )
        # Assign positionally: joined_genres is indexed by movie_id, while
        # df_mov keeps its default positional index.
        df_mov["genres"] = joined_genres.values
        return df_mov