作者:爱你不愿放cwy | 来源:互联网 | 2023-08-28 01:53
在推荐模型中,有很多因素会影响id之外的特性是否有用:
上下文的重要性:如果用户的偏好在不同上下文和时间下相对稳定,那么上下文特征可能不会提供太多好处。然而,如果用户偏好是高度上下文相关的,那么添加上下文特征将显著改善模型。例如,在决定是推荐一个短片还是一部电影时,一周的哪一天可能是一个重要的特征:用户在工作日可能只有时间看短内容,但可以在周末放松并享受一部完整长度的电影。类似地,查询时间戳可能在建模流行度动态中扮演重要角色:一部电影可能在其上映前后非常流行。
数据稀疏:如果数据稀疏,使用非id特征可能是关键。由于对给定用户或项目的观察很少,模型可能难以估计每个用户或每个项目的良好表示。为了建立一个准确的模型,其他的特征,如项目类别、描述和图像,必须被用来帮助模型泛化到训练数据以外的数据。这在冷启动的情况下尤其重要,因为在冷启动的情况下,一些项目或用户的数据相对较少。
import os
import tempfile

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs
特征数据
# Load the "train" split of the MovieLens 100K ratings and movie catalogue.
ratings = tfds.load("movielens/100k-ratings", split="train")
movies = tfds.load("movielens/100k-movies", split="train")

# Keep only the three rating fields used further down.
ratings = ratings.map(
    lambda x: {
        field: x[field] for field in ("movie_title", "user_id", "timestamp")
    }
)

# Reduce each movie record to its title.
movies = movies.map(lambda x: x["movie_title"])
特征词汇表
# Gather every rating timestamp into a single NumPy array, read in
# batches of 100.
timestamps = np.concatenate(
    list(ratings.map(lambda x: x["timestamp"]).batch(100))
)

min_timestamp = timestamps.min()
max_timestamp = timestamps.max()

# 1000 evenly spaced values from the smallest to the largest timestamp.
timestamp_buckets = np.linspace(min_timestamp, max_timestamp, num=1000)

# Vocabularies of the distinct movie titles and user ids.
unique_movie_titles = np.unique(
    np.concatenate(list(movies.batch(1000)))
)
unique_user_ids = np.unique(
    np.concatenate(
        list(ratings.batch(1_000).map(lambda x: x["user_id"]))
    )
)
定义模型 user model 代码中改动:增加是否使用时间戳这个特征的选择。
class UserModel(tf.keras.Model):
    """Query-side model: embeds the user id and, optionally, the timestamp.

    Reads the module-level ``unique_user_ids``, ``timestamp_buckets`` and
    ``timestamps`` arrays when it is constructed.
    """

    def __init__(self, use_timestamps):
        """Create the embedding layers.

        Args:
            use_timestamps: When truthy, the timestamp embedding and
                normalization layers are created and their outputs are
                appended to the user embedding in ``call``.
        """
        super().__init__()

        # These were a single fused statement in the original text
        # ("use_timestampsself"), which raised NameError.
        self._use_timestamps = use_timestamps

        self.user_embedding = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_user_ids, mask_token=None),
            # One row more than the vocabulary size.
            tf.keras.layers.Embedding(len(unique_user_ids) + 1, 32),
        ])

        if use_timestamps:
            # Bucketized timestamp -> 32-dimensional embedding.
            self.timestamp_embedding = tf.keras.Sequential([
                tf.keras.layers.Discretization(timestamp_buckets.tolist()),
                tf.keras.layers.Embedding(len(timestamp_buckets) + 1, 32),
            ])
            # The raw timestamp is also fed in, normalized with
            # statistics adapted from all observed timestamps.
            self.normalized_timestamp = tf.keras.layers.Normalization(
                axis=None)
            self.normalized_timestamp.adapt(timestamps)

    def call(self, inputs):
        """Return the user representation for a batch.

        Args:
            inputs: Mapping with a ``"user_id"`` entry and, when timestamps
                are enabled, a ``"timestamp"`` entry.

        Returns:
            The user embedding alone when timestamps are disabled;
            otherwise the user embedding, the timestamp embedding and the
            normalized timestamp (reshaped to one column) concatenated
            along axis 1.
        """
        if not self._use_timestamps:
            return self.user_embedding(inputs["user_id"])

        return tf.concat([
            self.user_embedding(inputs["user_id"]),
            self.timestamp_embedding(inputs["timestamp"]),
            tf.reshape(self.normalized_timestamp(inputs["timestamp"]), (-1, 1)),
        ], axis=1)
movie model
class MovieModel(tf.keras.Model):
    """Candidate-side model: embeds a movie title two ways.

    Reads the module-level ``unique_movie_titles`` array and ``movies``
    dataset when it is constructed.
    """

    def __init__(self):
        """Create the title lookup embedding and the title text embedding."""
        super().__init__()

        # This assignment was fused with the next statement in the original
        # text ("10_000self"), which is an invalid literal.
        max_tokens = 10_000

        # Whole-title lookup -> 32-dimensional embedding.
        self.title_embedding = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_movie_titles, mask_token=None),
            tf.keras.layers.Embedding(len(unique_movie_titles) + 1, 32),
        ])

        # Tokenized title -> per-token embeddings -> average over tokens.
        self.title_vectorizer = tf.keras.layers.TextVectorization(
            max_tokens=max_tokens)

        self.title_text_embedding = tf.keras.Sequential([
            self.title_vectorizer,
            tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True),
            tf.keras.layers.GlobalAveragePooling1D(),
        ])

        # Build the text vocabulary from the movie titles.
        self.title_vectorizer.adapt(movies)

    def call(self, titles):
        """Return both title embeddings concatenated along axis 1.

        Args:
            titles: Batch of movie titles.
        """
        return tf.concat([
            self.title_embedding(titles),
            self.title_text_embedding(titles),
        ], axis=1)
组合模型
class MovielensModel(tfrs.models.Model):
    """Retrieval model pairing a user query model with a movie candidate model."""

    def __init__(self, use_timestamps):
        """Assemble both models and the retrieval task.

        Args:
            use_timestamps: Passed straight through to ``UserModel``.
        """
        super().__init__()

        # Each side is followed by a Dense(32) projection.
        user_side = UserModel(use_timestamps)
        self.query_model = tf.keras.Sequential([
            user_side,
            tf.keras.layers.Dense(32),
        ])

        movie_side = MovieModel()
        self.candidate_model = tf.keras.Sequential([
            movie_side,
            tf.keras.layers.Dense(32),
        ])

        # The top-K metric is computed against every movie, embedded in
        # batches of 128 by the candidate model.
        candidate_embeddings = movies.batch(128).map(self.candidate_model)
        top_k_metric = tfrs.metrics.FactorizedTopK(
            candidates=candidate_embeddings,
        )
        self.task = tfrs.tasks.Retrieval(metrics=top_k_metric)

    def compute_loss(self, features, training=False):
        """Return the retrieval task's result for one batch.

        Args:
            features: Mapping with ``"user_id"``, ``"timestamp"`` and
                ``"movie_title"`` entries.
            training: Accepted but not used by this method.
        """
        query_inputs = {
            "user_id": features["user_id"],
            "timestamp": features["timestamp"],
        }
        query_embeddings = self.query_model(query_inputs)
        movie_embeddings = self.candidate_model(features["movie_title"])

        return self.task(query_embeddings, movie_embeddings)
实验 准备数据集
tf.random.set_seed(42)

# Shuffle once with a fixed seed; reshuffle_each_iteration=False keeps the
# order the same on every pass.
shuffled = ratings.shuffle(
    100_000,
    seed=42,
    reshuffle_each_iteration=False,
)

# First 80,000 examples for training, the following 20,000 for testing.
train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

# NOTE: despite its name, cached_train has no .cache() call; only the test
# batches are cached.
cached_train = train.shuffle(100_000).batch(2048)
cached_test = test.batch(4096).cache()
Baseline :没有时间戳特性
# Baseline run: the user model is built without timestamp features.
model = MovielensModel(use_timestamps=False)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

model.fit(cached_train, epochs=3)

train_accuracy = model.evaluate(cached_train, return_dict=True)[
    "factorized_top_k/top_100_categorical_accuracy"
]
test_accuracy = model.evaluate(cached_test, return_dict=True)[
    "factorized_top_k/top_100_categorical_accuracy"
]

# The format spec was " .2f " in the original text; the trailing space makes
# it an invalid specifier, so it is restored to ".2f".
print(f"Top-100 accuracy (train): {train_accuracy:.2f}.")
print(f"Top-100 accuracy (test): {test_accuracy:.2f}.")
利用时间特征捕捉时间动态
# Second run: the user model is built with timestamp features enabled.
model = MovielensModel(use_timestamps=True)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

model.fit(cached_train, epochs=3)

train_accuracy = model.evaluate(cached_train, return_dict=True)[
    "factorized_top_k/top_100_categorical_accuracy"
]
test_accuracy = model.evaluate(cached_test, return_dict=True)[
    "factorized_top_k/top_100_categorical_accuracy"
]

# The format spec was " .2f " in the original text; the trailing space makes
# it an invalid specifier, so it is restored to ".2f".
print(f"Top-100 accuracy (train): {train_accuracy:.2f}.")
print(f"Top-100 accuracy (test): {test_accuracy:.2f}.")