Spaces:
Sleeping
Sleeping
andykr1k commited on
Commit ·
07f113d
1
Parent(s): 93ab020
Added more post interactions
Browse files
app.py
CHANGED
|
@@ -13,6 +13,7 @@ import logging
|
|
| 13 |
from zoneinfo import ZoneInfo
|
| 14 |
from sentence_transformers import SentenceTransformer
|
| 15 |
from apscheduler.schedulers.background import BackgroundScheduler
|
|
|
|
| 16 |
|
| 17 |
# Configure logging
|
| 18 |
logging.basicConfig(level=logging.INFO)
|
|
@@ -55,12 +56,14 @@ sentence_model = SentenceTransformer('all-MiniLM-L6-v2', cache_folder='/tmp/cach
|
|
| 55 |
SUPABASE_URL = os.getenv('supabaseUrl')
|
| 56 |
SUPABASE_KEY = os.getenv('supabaseAnonKey')
|
| 57 |
|
|
|
|
| 58 |
def get_supabase_client():
|
| 59 |
global supabase_client
|
| 60 |
if supabase_client is None:
|
| 61 |
supabase_client = create_client(SUPABASE_URL, SUPABASE_KEY)
|
| 62 |
return supabase_client
|
| 63 |
|
|
|
|
| 64 |
def parse_datetime(dt_str: str) -> datetime:
|
| 65 |
"""Parse ISO datetime string and ensure correct microsecond precision."""
|
| 66 |
try:
|
|
@@ -69,7 +72,7 @@ def parse_datetime(dt_str: str) -> datetime:
|
|
| 69 |
time_part, tz_part = time_part.split('+')
|
| 70 |
if '.' in time_part:
|
| 71 |
time_without_micro, micro = time_part.split('.')
|
| 72 |
-
micro = micro.ljust(6, '0')
|
| 73 |
time_part = f"{time_without_micro}.{micro}"
|
| 74 |
dt_str = f"{date_part}T{time_part}+00:00"
|
| 75 |
return datetime.fromisoformat(dt_str).astimezone(TIMEZONE)
|
|
@@ -77,13 +80,46 @@ def parse_datetime(dt_str: str) -> datetime:
|
|
| 77 |
logger.error(f"Error parsing datetime: {dt_str} - {str(e)}")
|
| 78 |
raise
|
| 79 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
class Recommender:
|
| 81 |
-
def __init__(self
|
|
|
|
| 82 |
self.user_profiles = defaultdict(lambda: np.zeros(384))
|
| 83 |
self.post_popularity = defaultdict(float)
|
| 84 |
self.last_update = datetime.now(TIMEZONE) - HISTORY_WINDOW
|
| 85 |
|
| 86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
"""Fetch all rows from a table using pagination."""
|
| 88 |
supabase = get_supabase_client()
|
| 89 |
page_size = 1000
|
|
@@ -91,42 +127,126 @@ class Recommender:
|
|
| 91 |
all_data = []
|
| 92 |
|
| 93 |
while True:
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
if not response.data:
|
| 103 |
break
|
| 104 |
-
|
| 105 |
all_data.extend(response.data)
|
| 106 |
page += 1
|
| 107 |
|
| 108 |
return all_data
|
| 109 |
|
| 110 |
async def update_data(self):
|
| 111 |
-
#
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
user_interactions[user_id].add(post_id)
|
| 119 |
-
self.post_popularity[post_id] +=
|
| 120 |
if post_id in post_features:
|
| 121 |
-
self.user_profiles[user_id] += post_features[post_id]
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
post_texts, post_ids = [], []
|
| 127 |
for post in posts:
|
| 128 |
post_id = post['id']
|
| 129 |
-
text = f"{post.get('movie_name', '')} {post.get('content', '')}".strip(
|
|
|
|
| 130 |
post_texts.append(text)
|
| 131 |
post_ids.append(post_id)
|
| 132 |
post['created_at'] = parse_datetime(post['created_at'])
|
|
@@ -134,17 +254,20 @@ class Recommender:
|
|
| 134 |
post_metadata[post_id] = post
|
| 135 |
|
| 136 |
if post_texts:
|
| 137 |
-
embeddings = sentence_model.encode(
|
|
|
|
| 138 |
for post_id, embedding in zip(post_ids, embeddings):
|
| 139 |
post_features[post_id] = embedding / np.linalg.norm(embedding)
|
| 140 |
|
| 141 |
self.last_update = datetime.now(TIMEZONE)
|
| 142 |
-
|
|
|
|
|
|
|
|
|
|
| 143 |
|
| 144 |
def get_recommendations(self, user_id: str) -> List[Dict]:
|
| 145 |
user_profile = self.user_profiles[user_id]
|
| 146 |
seen_posts = user_interactions[user_id]
|
| 147 |
-
|
| 148 |
scores = {}
|
| 149 |
now = datetime.now(TIMEZONE)
|
| 150 |
|
|
@@ -152,17 +275,20 @@ class Recommender:
|
|
| 152 |
if post_id in seen_posts:
|
| 153 |
continue
|
| 154 |
|
| 155 |
-
sim_score = np.dot(user_profile, feature) if np.any(
|
|
|
|
| 156 |
time_diff = now - post_metadata[post_id]['created_at']
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
score =
|
|
|
|
|
|
|
| 160 |
scores[post_id] = score + random.uniform(-0.1, 0.1)
|
| 161 |
|
| 162 |
-
top_posts = sorted(
|
|
|
|
| 163 |
results = [post_metadata[post_id] for post_id, _ in top_posts]
|
| 164 |
random.shuffle(results)
|
| 165 |
-
|
| 166 |
return results
|
| 167 |
|
| 168 |
recommender = Recommender()
|
|
|
|
| 13 |
from zoneinfo import ZoneInfo
|
| 14 |
from sentence_transformers import SentenceTransformer
|
| 15 |
from apscheduler.schedulers.background import BackgroundScheduler
|
| 16 |
+
import math
|
| 17 |
|
| 18 |
# Configure logging
|
| 19 |
logging.basicConfig(level=logging.INFO)
|
|
|
|
| 56 |
SUPABASE_URL = os.getenv('supabaseUrl')
|
| 57 |
SUPABASE_KEY = os.getenv('supabaseAnonKey')
|
| 58 |
|
| 59 |
+
|
| 60 |
def get_supabase_client():
|
| 61 |
global supabase_client
|
| 62 |
if supabase_client is None:
|
| 63 |
supabase_client = create_client(SUPABASE_URL, SUPABASE_KEY)
|
| 64 |
return supabase_client
|
| 65 |
|
| 66 |
+
|
| 67 |
def parse_datetime(dt_str: str) -> datetime:
|
| 68 |
"""Parse ISO datetime string and ensure correct microsecond precision."""
|
| 69 |
try:
|
|
|
|
| 72 |
time_part, tz_part = time_part.split('+')
|
| 73 |
if '.' in time_part:
|
| 74 |
time_without_micro, micro = time_part.split('.')
|
| 75 |
+
micro = micro.ljust(6, '0')
|
| 76 |
time_part = f"{time_without_micro}.{micro}"
|
| 77 |
dt_str = f"{date_part}T{time_part}+00:00"
|
| 78 |
return datetime.fromisoformat(dt_str).astimezone(TIMEZONE)
|
|
|
|
| 80 |
logger.error(f"Error parsing datetime: {dt_str} - {str(e)}")
|
| 81 |
raise
|
| 82 |
|
| 83 |
+
|
| 84 |
+
def decay_weight(interaction_time: datetime, current_time: datetime, decay_rate: float = 0.0001) -> float:
|
| 85 |
+
"""
|
| 86 |
+
Compute an exponential decay weight for an interaction based on its age.
|
| 87 |
+
The decay_rate can be tuned to control how fast interactions lose weight.
|
| 88 |
+
"""
|
| 89 |
+
time_diff = (current_time - interaction_time).total_seconds()
|
| 90 |
+
return math.exp(-decay_rate * time_diff)
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def normalize_profile(profile: np.ndarray) -> np.ndarray:
|
| 94 |
+
norm = np.linalg.norm(profile)
|
| 95 |
+
return profile / norm if norm > 0 else profile
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def compute_score(sim_score: float, popularity: float, freshness: float) -> float:
|
| 99 |
+
"""
|
| 100 |
+
Compute a non-linear recommendation score.
|
| 101 |
+
- sim_score is squared to emphasize strong similarities.
|
| 102 |
+
- Popularity is log-transformed.
|
| 103 |
+
- Freshness is combined linearly.
|
| 104 |
+
"""
|
| 105 |
+
return 0.6 * (sim_score ** 2) + 0.3 * np.log1p(popularity) + 0.1 * freshness
|
| 106 |
+
|
| 107 |
+
|
| 108 |
class Recommender:
|
| 109 |
+
def __init__(self, like_weight=1.0, comment_weight=0.5, comment_like_weight=0.3,
|
| 110 |
+
reply_weight=0.5, reply_like_weight=0.3):
|
| 111 |
self.user_profiles = defaultdict(lambda: np.zeros(384))
|
| 112 |
self.post_popularity = defaultdict(float)
|
| 113 |
self.last_update = datetime.now(TIMEZONE) - HISTORY_WINDOW
|
| 114 |
|
| 115 |
+
# Parameterized weights
|
| 116 |
+
self.like_weight = like_weight
|
| 117 |
+
self.comment_weight = comment_weight
|
| 118 |
+
self.comment_like_weight = comment_like_weight
|
| 119 |
+
self.reply_weight = reply_weight
|
| 120 |
+
self.reply_like_weight = reply_like_weight
|
| 121 |
+
|
| 122 |
+
async def fetch_all_rows(self, table_name: str, columns: str, last_update: datetime, post_id_not_null: bool):
|
| 123 |
"""Fetch all rows from a table using pagination."""
|
| 124 |
supabase = get_supabase_client()
|
| 125 |
page_size = 1000
|
|
|
|
| 127 |
all_data = []
|
| 128 |
|
| 129 |
while True:
|
| 130 |
+
if post_id_not_null:
|
| 131 |
+
response = await asyncio.to_thread(
|
| 132 |
+
supabase.table(table_name)
|
| 133 |
+
.select(columns)
|
| 134 |
+
.gt('created_at', last_update.isoformat())
|
| 135 |
+
.not_.is_("post_id", None)
|
| 136 |
+
.range(page * page_size, (page + 1) * page_size - 1)
|
| 137 |
+
.execute
|
| 138 |
+
)
|
| 139 |
+
else:
|
| 140 |
+
response = await asyncio.to_thread(
|
| 141 |
+
supabase.table(table_name)
|
| 142 |
+
.select(columns)
|
| 143 |
+
.gt('created_at', last_update.isoformat())
|
| 144 |
+
.range(page * page_size, (page + 1) * page_size - 1)
|
| 145 |
+
.execute
|
| 146 |
+
)
|
| 147 |
+
|
| 148 |
if not response.data:
|
| 149 |
break
|
| 150 |
+
|
| 151 |
all_data.extend(response.data)
|
| 152 |
page += 1
|
| 153 |
|
| 154 |
return all_data
|
| 155 |
|
| 156 |
async def update_data(self):
|
| 157 |
+
# Current time for decay calculation
|
| 158 |
+
current_time = datetime.now(TIMEZONE)
|
| 159 |
+
|
| 160 |
+
# Fetch all interaction data since last update
|
| 161 |
+
likes = await self.fetch_all_rows('likes', 'user_id, post_id, created_at', self.last_update, True)
|
| 162 |
+
comments = await self.fetch_all_rows('comments', 'id, user_id, post_id, created_at, comment', self.last_update, True)
|
| 163 |
+
commentlikes = await self.fetch_all_rows('commentlikes', 'user_id, author_id, post_id, created_at, comment_id', self.last_update, True)
|
| 164 |
+
replies = await self.fetch_all_rows('replies', 'user_id, to, post_id, created_at, comment, comment_id, reply_id', self.last_update, True)
|
| 165 |
+
replylikes = await self.fetch_all_rows('replylikes', 'user_id, author_id, post_id, created_at, reply_id', self.last_update, True)
|
| 166 |
+
|
| 167 |
+
# Process likes with time decay
|
| 168 |
+
for like in likes:
|
| 169 |
+
user_id = like['user_id']
|
| 170 |
+
post_id = like['post_id']
|
| 171 |
+
interaction_time = parse_datetime(like['created_at'])
|
| 172 |
+
weight = decay_weight(interaction_time, current_time)
|
| 173 |
user_interactions[user_id].add(post_id)
|
| 174 |
+
self.post_popularity[post_id] += self.like_weight * weight
|
| 175 |
if post_id in post_features:
|
| 176 |
+
self.user_profiles[user_id] += post_features[post_id] * \
|
| 177 |
+
self.like_weight * weight
|
| 178 |
+
|
| 179 |
+
# Process comments with time decay
|
| 180 |
+
for comment in comments:
|
| 181 |
+
user_id = comment['user_id']
|
| 182 |
+
post_id = comment['post_id']
|
| 183 |
+
interaction_time = parse_datetime(comment['created_at'])
|
| 184 |
+
weight = decay_weight(interaction_time, current_time)
|
| 185 |
+
user_interactions[user_id].add(post_id)
|
| 186 |
+
self.post_popularity[post_id] += self.comment_weight * weight
|
| 187 |
+
if post_id in post_features:
|
| 188 |
+
self.user_profiles[user_id] += post_features[post_id] * \
|
| 189 |
+
self.comment_weight * weight
|
| 190 |
+
|
| 191 |
+
# Process comment likes with time decay
|
| 192 |
+
for clike in commentlikes:
|
| 193 |
+
user_id = clike['user_id'] # User who liked the comment
|
| 194 |
+
post_id = clike['post_id']
|
| 195 |
+
interaction_time = parse_datetime(clike['created_at'])
|
| 196 |
+
weight = decay_weight(interaction_time, current_time)
|
| 197 |
+
user_interactions[user_id].add(post_id)
|
| 198 |
+
self.post_popularity[post_id] += self.comment_like_weight * weight
|
| 199 |
+
if post_id in post_features:
|
| 200 |
+
self.user_profiles[user_id] += post_features[post_id] * \
|
| 201 |
+
self.comment_like_weight * weight
|
| 202 |
+
|
| 203 |
+
# Process replies with time decay
|
| 204 |
+
for reply in replies:
|
| 205 |
+
user_id = reply['user_id']
|
| 206 |
+
post_id = reply['post_id']
|
| 207 |
+
interaction_time = parse_datetime(reply['created_at'])
|
| 208 |
+
weight = decay_weight(interaction_time, current_time)
|
| 209 |
+
user_interactions[user_id].add(post_id)
|
| 210 |
+
self.post_popularity[post_id] += self.reply_weight * weight
|
| 211 |
+
if post_id in post_features:
|
| 212 |
+
self.user_profiles[user_id] += post_features[post_id] * \
|
| 213 |
+
self.reply_weight * weight
|
| 214 |
+
|
| 215 |
+
# Process reply likes with time decay
|
| 216 |
+
for rlike in replylikes:
|
| 217 |
+
user_id = rlike['user_id']
|
| 218 |
+
post_id = rlike['post_id']
|
| 219 |
+
interaction_time = parse_datetime(rlike['created_at'])
|
| 220 |
+
weight = decay_weight(interaction_time, current_time)
|
| 221 |
+
user_interactions[user_id].add(post_id)
|
| 222 |
+
self.post_popularity[post_id] += self.reply_like_weight * weight
|
| 223 |
+
if post_id in post_features:
|
| 224 |
+
self.user_profiles[user_id] += post_features[post_id] * \
|
| 225 |
+
self.reply_like_weight * weight
|
| 226 |
+
|
| 227 |
+
# OPTIONAL: Process negative feedback if available
|
| 228 |
+
# for negative in negative_interactions:
|
| 229 |
+
# user_id = negative['user_id']
|
| 230 |
+
# post_id = negative['post_id']
|
| 231 |
+
# interaction_time = parse_datetime(negative['created_at'])
|
| 232 |
+
# weight = decay_weight(interaction_time, current_time)
|
| 233 |
+
# user_interactions[user_id].add(post_id)
|
| 234 |
+
# self.post_popularity[post_id] -= some_negative_weight * weight
|
| 235 |
+
# if post_id in post_features:
|
| 236 |
+
# self.user_profiles[user_id] -= post_features[post_id] * some_negative_weight * weight
|
| 237 |
+
|
| 238 |
+
# Normalize user profiles after processing interactions
|
| 239 |
+
for user_id in self.user_profiles:
|
| 240 |
+
self.user_profiles[user_id] = normalize_profile(
|
| 241 |
+
self.user_profiles[user_id])
|
| 242 |
+
|
| 243 |
+
# Fetch and update post features
|
| 244 |
+
posts = await self.fetch_all_rows('posts', '*', self.last_update, False)
|
| 245 |
post_texts, post_ids = [], []
|
| 246 |
for post in posts:
|
| 247 |
post_id = post['id']
|
| 248 |
+
text = f"{post.get('movie_name', '')} {post.get('content', '')}".strip(
|
| 249 |
+
)
|
| 250 |
post_texts.append(text)
|
| 251 |
post_ids.append(post_id)
|
| 252 |
post['created_at'] = parse_datetime(post['created_at'])
|
|
|
|
| 254 |
post_metadata[post_id] = post
|
| 255 |
|
| 256 |
if post_texts:
|
| 257 |
+
embeddings = sentence_model.encode(
|
| 258 |
+
post_texts, show_progress_bar=False, convert_to_numpy=True)
|
| 259 |
for post_id, embedding in zip(post_ids, embeddings):
|
| 260 |
post_features[post_id] = embedding / np.linalg.norm(embedding)
|
| 261 |
|
| 262 |
self.last_update = datetime.now(TIMEZONE)
|
| 263 |
+
total_interactions = len(likes) + len(comments) + \
|
| 264 |
+
len(commentlikes) + len(replies) + len(replylikes)
|
| 265 |
+
logger.info(
|
| 266 |
+
f"Data updated: {len(posts)} posts, {total_interactions} interactions")
|
| 267 |
|
| 268 |
def get_recommendations(self, user_id: str) -> List[Dict]:
|
| 269 |
user_profile = self.user_profiles[user_id]
|
| 270 |
seen_posts = user_interactions[user_id]
|
|
|
|
| 271 |
scores = {}
|
| 272 |
now = datetime.now(TIMEZONE)
|
| 273 |
|
|
|
|
| 275 |
if post_id in seen_posts:
|
| 276 |
continue
|
| 277 |
|
| 278 |
+
sim_score = np.dot(user_profile, feature) if np.any(
|
| 279 |
+
user_profile) else 0
|
| 280 |
time_diff = now - post_metadata[post_id]['created_at']
|
| 281 |
+
# Freshness is defined as an exponential decay based on weeks old
|
| 282 |
+
freshness = math.exp(-time_diff.days / 7.0)
|
| 283 |
+
score = compute_score(
|
| 284 |
+
sim_score, self.post_popularity[post_id], freshness)
|
| 285 |
+
# Adding a small random noise for exploration
|
| 286 |
scores[post_id] = score + random.uniform(-0.1, 0.1)
|
| 287 |
|
| 288 |
+
top_posts = sorted(
|
| 289 |
+
scores.items(), key=lambda x: x[1], reverse=True)[:TOP_K]
|
| 290 |
results = [post_metadata[post_id] for post_id, _ in top_posts]
|
| 291 |
random.shuffle(results)
|
|
|
|
| 292 |
return results
|
| 293 |
|
| 294 |
recommender = Recommender()
|
test.py
ADDED
|
@@ -0,0 +1,297 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import math
|
| 3 |
+
import random
|
| 4 |
+
import asyncio
|
| 5 |
+
import logging
|
| 6 |
+
from collections import defaultdict
|
| 7 |
+
from datetime import datetime, timedelta
|
| 8 |
+
from zoneinfo import ZoneInfo
|
| 9 |
+
from typing import List, Dict
|
| 10 |
+
|
| 11 |
+
import numpy as np
|
| 12 |
+
from dotenv import load_dotenv
|
| 13 |
+
from supabase import create_client
|
| 14 |
+
from sentence_transformers import SentenceTransformer
|
| 15 |
+
import pdb
|
| 16 |
+
|
| 17 |
+
# Configure logging
|
| 18 |
+
logging.basicConfig(level=logging.INFO)
|
| 19 |
+
logger = logging.getLogger(__name__)
|
| 20 |
+
|
| 21 |
+
# Load environment variables
|
| 22 |
+
load_dotenv()
|
| 23 |
+
|
| 24 |
+
# Configuration
|
| 25 |
+
SEED = 42
|
| 26 |
+
random.seed(SEED)
|
| 27 |
+
np.random.seed(SEED)
|
| 28 |
+
|
| 29 |
+
TOP_K = 75
|
| 30 |
+
HISTORY_WINDOW = timedelta(days=1000)
|
| 31 |
+
TIMEZONE = ZoneInfo("UTC")
|
| 32 |
+
|
| 33 |
+
# Global variables
|
| 34 |
+
supabase_client = None
|
| 35 |
+
user_interactions = defaultdict(set)
|
| 36 |
+
post_features = {}
|
| 37 |
+
post_metadata = {}
|
| 38 |
+
|
| 39 |
+
if not os.path.exists('/tmp/cache'):
|
| 40 |
+
os.makedirs('/tmp/cache')
|
| 41 |
+
sentence_model = SentenceTransformer(
|
| 42 |
+
'all-MiniLM-L6-v2', cache_folder='/tmp/cache')
|
| 43 |
+
|
| 44 |
+
SUPABASE_URL = os.getenv('supabaseUrl')
|
| 45 |
+
SUPABASE_KEY = os.getenv('supabaseAnonKey')
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def get_supabase_client():
|
| 49 |
+
global supabase_client
|
| 50 |
+
if supabase_client is None:
|
| 51 |
+
supabase_client = create_client(SUPABASE_URL, SUPABASE_KEY)
|
| 52 |
+
return supabase_client
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def parse_datetime(dt_str: str) -> datetime:
|
| 56 |
+
"""Parse ISO datetime string and ensure correct microsecond precision."""
|
| 57 |
+
try:
|
| 58 |
+
if '.' in dt_str:
|
| 59 |
+
date_part, time_part = dt_str.split('T')
|
| 60 |
+
time_part, tz_part = time_part.split('+')
|
| 61 |
+
if '.' in time_part:
|
| 62 |
+
time_without_micro, micro = time_part.split('.')
|
| 63 |
+
micro = micro.ljust(6, '0')
|
| 64 |
+
time_part = f"{time_without_micro}.{micro}"
|
| 65 |
+
dt_str = f"{date_part}T{time_part}+00:00"
|
| 66 |
+
return datetime.fromisoformat(dt_str).astimezone(TIMEZONE)
|
| 67 |
+
except Exception as e:
|
| 68 |
+
logger.error(f"Error parsing datetime: {dt_str} - {str(e)}")
|
| 69 |
+
raise
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def decay_weight(interaction_time: datetime, current_time: datetime, decay_rate: float = 0.0001) -> float:
|
| 73 |
+
"""
|
| 74 |
+
Compute an exponential decay weight for an interaction based on its age.
|
| 75 |
+
The decay_rate can be tuned to control how fast interactions lose weight.
|
| 76 |
+
"""
|
| 77 |
+
time_diff = (current_time - interaction_time).total_seconds()
|
| 78 |
+
return math.exp(-decay_rate * time_diff)
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def normalize_profile(profile: np.ndarray) -> np.ndarray:
|
| 82 |
+
norm = np.linalg.norm(profile)
|
| 83 |
+
return profile / norm if norm > 0 else profile
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def compute_score(sim_score: float, popularity: float, freshness: float) -> float:
|
| 87 |
+
"""
|
| 88 |
+
Compute a non-linear recommendation score.
|
| 89 |
+
- sim_score is squared to emphasize strong similarities.
|
| 90 |
+
- Popularity is log-transformed.
|
| 91 |
+
- Freshness is combined linearly.
|
| 92 |
+
"""
|
| 93 |
+
return 0.6 * (sim_score ** 2) + 0.3 * np.log1p(popularity) + 0.1 * freshness
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
class Recommender:
|
| 97 |
+
def __init__(self, like_weight=1.0, comment_weight=0.5, comment_like_weight=0.3,
|
| 98 |
+
reply_weight=0.5, reply_like_weight=0.3):
|
| 99 |
+
self.user_profiles = defaultdict(lambda: np.zeros(384))
|
| 100 |
+
self.post_popularity = defaultdict(float)
|
| 101 |
+
self.last_update = datetime.now(TIMEZONE) - HISTORY_WINDOW
|
| 102 |
+
|
| 103 |
+
# Parameterized weights
|
| 104 |
+
self.like_weight = like_weight
|
| 105 |
+
self.comment_weight = comment_weight
|
| 106 |
+
self.comment_like_weight = comment_like_weight
|
| 107 |
+
self.reply_weight = reply_weight
|
| 108 |
+
self.reply_like_weight = reply_like_weight
|
| 109 |
+
|
| 110 |
+
async def fetch_all_rows(self, table_name: str, columns: str, last_update: datetime, post_id_not_null: bool):
|
| 111 |
+
"""Fetch all rows from a table using pagination."""
|
| 112 |
+
supabase = get_supabase_client()
|
| 113 |
+
page_size = 1000
|
| 114 |
+
page = 0
|
| 115 |
+
all_data = []
|
| 116 |
+
|
| 117 |
+
while True:
|
| 118 |
+
if post_id_not_null:
|
| 119 |
+
response = await asyncio.to_thread(
|
| 120 |
+
supabase.table(table_name)
|
| 121 |
+
.select(columns)
|
| 122 |
+
.gt('created_at', last_update.isoformat())
|
| 123 |
+
.not_.is_("post_id", None)
|
| 124 |
+
.range(page * page_size, (page + 1) * page_size - 1)
|
| 125 |
+
.execute
|
| 126 |
+
)
|
| 127 |
+
else:
|
| 128 |
+
response = await asyncio.to_thread(
|
| 129 |
+
supabase.table(table_name)
|
| 130 |
+
.select(columns)
|
| 131 |
+
.gt('created_at', last_update.isoformat())
|
| 132 |
+
.range(page * page_size, (page + 1) * page_size - 1)
|
| 133 |
+
.execute
|
| 134 |
+
)
|
| 135 |
+
|
| 136 |
+
if not response.data:
|
| 137 |
+
break
|
| 138 |
+
|
| 139 |
+
all_data.extend(response.data)
|
| 140 |
+
page += 1
|
| 141 |
+
|
| 142 |
+
return all_data
|
| 143 |
+
|
| 144 |
+
async def update_data(self):
|
| 145 |
+
# Current time for decay calculation
|
| 146 |
+
current_time = datetime.now(TIMEZONE)
|
| 147 |
+
|
| 148 |
+
# Fetch all interaction data since last update
|
| 149 |
+
likes = await self.fetch_all_rows('likes', 'user_id, post_id, created_at', self.last_update, True)
|
| 150 |
+
comments = await self.fetch_all_rows('comments', 'id, user_id, post_id, created_at, comment', self.last_update, True)
|
| 151 |
+
commentlikes = await self.fetch_all_rows('commentlikes', 'user_id, author_id, post_id, created_at, comment_id', self.last_update, True)
|
| 152 |
+
replies = await self.fetch_all_rows('replies', 'user_id, to, post_id, created_at, comment, comment_id, reply_id', self.last_update, True)
|
| 153 |
+
replylikes = await self.fetch_all_rows('replylikes', 'user_id, author_id, post_id, created_at, reply_id', self.last_update, True)
|
| 154 |
+
|
| 155 |
+
# Process likes with time decay
|
| 156 |
+
for like in likes:
|
| 157 |
+
user_id = like['user_id']
|
| 158 |
+
post_id = like['post_id']
|
| 159 |
+
interaction_time = parse_datetime(like['created_at'])
|
| 160 |
+
weight = decay_weight(interaction_time, current_time)
|
| 161 |
+
user_interactions[user_id].add(post_id)
|
| 162 |
+
self.post_popularity[post_id] += self.like_weight * weight
|
| 163 |
+
if post_id in post_features:
|
| 164 |
+
self.user_profiles[user_id] += post_features[post_id] * \
|
| 165 |
+
self.like_weight * weight
|
| 166 |
+
|
| 167 |
+
# Process comments with time decay
|
| 168 |
+
for comment in comments:
|
| 169 |
+
user_id = comment['user_id']
|
| 170 |
+
post_id = comment['post_id']
|
| 171 |
+
interaction_time = parse_datetime(comment['created_at'])
|
| 172 |
+
weight = decay_weight(interaction_time, current_time)
|
| 173 |
+
user_interactions[user_id].add(post_id)
|
| 174 |
+
self.post_popularity[post_id] += self.comment_weight * weight
|
| 175 |
+
if post_id in post_features:
|
| 176 |
+
self.user_profiles[user_id] += post_features[post_id] * \
|
| 177 |
+
self.comment_weight * weight
|
| 178 |
+
|
| 179 |
+
# Process comment likes with time decay
|
| 180 |
+
for clike in commentlikes:
|
| 181 |
+
user_id = clike['user_id'] # User who liked the comment
|
| 182 |
+
post_id = clike['post_id']
|
| 183 |
+
interaction_time = parse_datetime(clike['created_at'])
|
| 184 |
+
weight = decay_weight(interaction_time, current_time)
|
| 185 |
+
user_interactions[user_id].add(post_id)
|
| 186 |
+
self.post_popularity[post_id] += self.comment_like_weight * weight
|
| 187 |
+
if post_id in post_features:
|
| 188 |
+
self.user_profiles[user_id] += post_features[post_id] * \
|
| 189 |
+
self.comment_like_weight * weight
|
| 190 |
+
|
| 191 |
+
# Process replies with time decay
|
| 192 |
+
for reply in replies:
|
| 193 |
+
user_id = reply['user_id']
|
| 194 |
+
post_id = reply['post_id']
|
| 195 |
+
interaction_time = parse_datetime(reply['created_at'])
|
| 196 |
+
weight = decay_weight(interaction_time, current_time)
|
| 197 |
+
user_interactions[user_id].add(post_id)
|
| 198 |
+
self.post_popularity[post_id] += self.reply_weight * weight
|
| 199 |
+
if post_id in post_features:
|
| 200 |
+
self.user_profiles[user_id] += post_features[post_id] * \
|
| 201 |
+
self.reply_weight * weight
|
| 202 |
+
|
| 203 |
+
# Process reply likes with time decay
|
| 204 |
+
for rlike in replylikes:
|
| 205 |
+
user_id = rlike['user_id']
|
| 206 |
+
post_id = rlike['post_id']
|
| 207 |
+
interaction_time = parse_datetime(rlike['created_at'])
|
| 208 |
+
weight = decay_weight(interaction_time, current_time)
|
| 209 |
+
user_interactions[user_id].add(post_id)
|
| 210 |
+
self.post_popularity[post_id] += self.reply_like_weight * weight
|
| 211 |
+
if post_id in post_features:
|
| 212 |
+
self.user_profiles[user_id] += post_features[post_id] * \
|
| 213 |
+
self.reply_like_weight * weight
|
| 214 |
+
|
| 215 |
+
# OPTIONAL: Process negative feedback if available
|
| 216 |
+
# for negative in negative_interactions:
|
| 217 |
+
# user_id = negative['user_id']
|
| 218 |
+
# post_id = negative['post_id']
|
| 219 |
+
# interaction_time = parse_datetime(negative['created_at'])
|
| 220 |
+
# weight = decay_weight(interaction_time, current_time)
|
| 221 |
+
# user_interactions[user_id].add(post_id)
|
| 222 |
+
# self.post_popularity[post_id] -= some_negative_weight * weight
|
| 223 |
+
# if post_id in post_features:
|
| 224 |
+
# self.user_profiles[user_id] -= post_features[post_id] * some_negative_weight * weight
|
| 225 |
+
|
| 226 |
+
# Normalize user profiles after processing interactions
|
| 227 |
+
for user_id in self.user_profiles:
|
| 228 |
+
self.user_profiles[user_id] = normalize_profile(
|
| 229 |
+
self.user_profiles[user_id])
|
| 230 |
+
|
| 231 |
+
# Fetch and update post features
|
| 232 |
+
posts = await self.fetch_all_rows('posts', '*', self.last_update, False)
|
| 233 |
+
post_texts, post_ids = [], []
|
| 234 |
+
for post in posts:
|
| 235 |
+
post_id = post['id']
|
| 236 |
+
text = f"{post.get('movie_name', '')} {post.get('content', '')}".strip(
|
| 237 |
+
)
|
| 238 |
+
post_texts.append(text)
|
| 239 |
+
post_ids.append(post_id)
|
| 240 |
+
post['created_at'] = parse_datetime(post['created_at'])
|
| 241 |
+
post['type'] = 'post'
|
| 242 |
+
post_metadata[post_id] = post
|
| 243 |
+
|
| 244 |
+
if post_texts:
|
| 245 |
+
embeddings = sentence_model.encode(
|
| 246 |
+
post_texts, show_progress_bar=False, convert_to_numpy=True)
|
| 247 |
+
for post_id, embedding in zip(post_ids, embeddings):
|
| 248 |
+
post_features[post_id] = embedding / np.linalg.norm(embedding)
|
| 249 |
+
|
| 250 |
+
self.last_update = datetime.now(TIMEZONE)
|
| 251 |
+
total_interactions = len(likes) + len(comments) + \
|
| 252 |
+
len(commentlikes) + len(replies) + len(replylikes)
|
| 253 |
+
logger.info(
|
| 254 |
+
f"Data updated: {len(posts)} posts, {total_interactions} interactions")
|
| 255 |
+
|
| 256 |
+
def get_recommendations(self, user_id: str) -> List[Dict]:
|
| 257 |
+
user_profile = self.user_profiles[user_id]
|
| 258 |
+
seen_posts = user_interactions[user_id]
|
| 259 |
+
scores = {}
|
| 260 |
+
now = datetime.now(TIMEZONE)
|
| 261 |
+
|
| 262 |
+
for post_id, feature in post_features.items():
|
| 263 |
+
if post_id in seen_posts:
|
| 264 |
+
continue
|
| 265 |
+
|
| 266 |
+
sim_score = np.dot(user_profile, feature) if np.any(
|
| 267 |
+
user_profile) else 0
|
| 268 |
+
time_diff = now - post_metadata[post_id]['created_at']
|
| 269 |
+
# Freshness is defined as an exponential decay based on weeks old
|
| 270 |
+
freshness = math.exp(-time_diff.days / 7.0)
|
| 271 |
+
score = compute_score(
|
| 272 |
+
sim_score, self.post_popularity[post_id], freshness)
|
| 273 |
+
# Adding a small random noise for exploration
|
| 274 |
+
scores[post_id] = score + random.uniform(-0.1, 0.1)
|
| 275 |
+
|
| 276 |
+
top_posts = sorted(
|
| 277 |
+
scores.items(), key=lambda x: x[1], reverse=True)[:TOP_K]
|
| 278 |
+
results = [post_metadata[post_id] for post_id, _ in top_posts]
|
| 279 |
+
random.shuffle(results)
|
| 280 |
+
return results
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
recommender = Recommender()
|
| 284 |
+
|
| 285 |
+
|
| 286 |
+
async def main():
|
| 287 |
+
# pdb.set_trace()
|
| 288 |
+
user_id = "d7411324-c8ea-42cb-ae59-7b8cbc61594c"
|
| 289 |
+
await recommender.update_data()
|
| 290 |
+
recommendations = recommender.get_recommendations(user_id)
|
| 291 |
+
print(f"Recommendations for user {user_id}:")
|
| 292 |
+
for post in recommendations:
|
| 293 |
+
print(
|
| 294 |
+
f"- Post {post['id']}: Movie: {post.get('movie_name', '')} Caption: {post.get('content', '')}")
|
| 295 |
+
|
| 296 |
+
if __name__ == "__main__":
|
| 297 |
+
asyncio.run(main())
|