# k-recipe2vec / backend / src / logic.py
# ๊ฐ•๋ฏผ๊ท 
# Fix: Absolute path for data loading and add import os
# c95db27
# logic.py
import streamlit as st
import pandas as pd
import numpy as np
from gensim.models import Word2Vec, Doc2Vec
from ast import literal_eval
import pickle
from datetime import datetime, timedelta, timezone
from supabase import create_client
import re
import os
from collections import Counter
# ==========================================
# 0. Environment setup and rule definitions
# ==========================================
# Ordered (rank, keywords) rules used by get_estimated_price_rank when an
# ingredient is not present in the loaded price map: the rank of the first
# rule that has a keyword contained in the ingredient name is returned.
# NOTE(review): rank 5 presumably means "most expensive" and 1 "cheapest" -
# confirm against data/price_rank.csv.
# NOTE(review): the keyword literals look mis-encoded in this copy of the
# file; they are left untouched here.
PRICE_KEYWORD_RULES = [
(5, ['์†Œ๊ณ ๊ธฐ', 'ํ•œ์šฐ', '์ฑ„๋', '๋“ฑ์‹ฌ', '์•ˆ์‹ฌ', '๊ฐˆ๋น„์‚ด', '์ „๋ณต', '์žฅ์–ด']),
(4, ['๋ผ์ง€', '์‚ผ๊ฒน', '๋ชฉ์‚ด', '์•ž๋‹ค๋ฆฌ', '๋’ท๋‹ค๋ฆฌ', '๊ฐˆ๋น„', '์˜ค๋ฆฌ', '๋‚™์ง€', '์˜ค์ง•์–ด', '์ƒˆ์šฐ', '๋ช…๋ž€']),
(3, ['๋‹ญ', '์น˜ํ‚จ', 'ํ–„', '์†Œ์‹œ์ง€', '๋ฒ ์ด์ปจ', '์ŠคํŒธ', '์ฐธ์น˜', '๋™์›', '์–ด๋ฌต', '๋ง›์‚ด', '๋ฒ„์„ฏ', '์น˜์ฆˆ']),
(2, ['๋‘๋ถ€', '์ˆœ๋‘๋ถ€', '์ฝฉ๋‚˜๋ฌผ', '์ˆ™์ฃผ', '๊น€์น˜', '๋ฌด', '๊ฐ์ž', '๊ณ ๊ตฌ๋งˆ', '๋‹น๊ทผ', 'ํ˜ธ๋ฐ•']),
(1, ['์–‘ํŒŒ', '๋Œ€ํŒŒ', '์ชฝํŒŒ', '์‹คํŒŒ', '๋งˆ๋Š˜', '๊ณ ์ถ”', '๋ฌผ', '์†Œ๊ธˆ', '์„คํƒ•', '๊ฐ„์žฅ', '์†Œ์Šค', '์–‘๋…', '์œก์ˆ˜'])
]
# Ingredient names containing any of these substrings bypass the keyword rules
# above and get the default rank 3 (checked first in get_estimated_price_rank).
PRICE_RULE_EXCEPTIONS = ['๋ผ์ง€๊ฐ์ž', '๋‹ญ์˜์žฅํ’€', '์ƒˆ์šฐ์ “', '๋ฉธ์น˜์•ก์ “', '๋‹ค์‹œ๋‹ค']
# ==========================================
# 1. Supabase DB integration and data save/load
# ==========================================
@st.cache_resource
def init_supabase():
    """Create the Supabase client, or return None when it cannot be built.

    Credentials are looked up in Streamlit secrets (``supabase.url`` and
    ``supabase.key``) first; when that does not yield a complete pair, the
    ``SUPABASE_URL`` / ``SUPABASE_KEY`` environment variables are used.

    Returns:
        The result of ``create_client(url, key)``, or ``None`` when no
        complete credential pair was found or client creation raised.
    """
    try:
        url = None
        key = None
        # 1. Try Streamlit secrets
        try:
            if hasattr(st, "secrets") and "supabase" in st.secrets:
                url = st.secrets["supabase"]["url"]
                key = st.secrets["supabase"]["key"]
        except Exception:
            # Best effort: a missing or unreadable secrets file must not stop
            # the environment-variable fallback. Narrowed from a bare
            # ``except:`` so KeyboardInterrupt / SystemExit still propagate.
            pass
        # 2. Fall back to OS environment variables
        if not url or not key:
            url = os.environ.get("SUPABASE_URL")
            key = os.environ.get("SUPABASE_KEY")
        if not url or not key:
            # Running without credentials (e.g. local development) is allowed,
            # but leave a trace because the API server needs the database.
            print("Supabase credentials not found; database features are disabled.")
            return None
        return create_client(url, key)
    except Exception as e:
        print(f"Supabase ์—ฐ๊ฒฐ ๊ฒฝ๊ณ : {e}")
        return None
def get_kst_now_iso():
    """Return the current time at a fixed UTC+09:00 offset as an ISO 8601 string."""
    return datetime.now(timezone(timedelta(hours=9))).isoformat()
@st.cache_data(ttl=300)
def load_global_stopwords():
    """Return the ``word`` values of the ``stopwords`` table, newest first.

    The result is cached by Streamlit for 300 seconds. Any failure is printed
    and reported as an empty list.
    """
    try:
        client = init_supabase()
        result = (
            client.table("stopwords")
            .select("word")
            .order("created_at", desc=True)
            .execute()
        )
        rows = result.data
        if not rows:
            return []
        return [row['word'] for row in rows]
    except Exception as e:
        print(f"๋ถˆ์šฉ์–ด ๋กœ๋“œ ์‹คํŒจ: {e}")
        return []
@st.cache_data(ttl=600)
def get_usage_stats(timeframe='today'):
    """Summarise the ``usage_log`` table.

    Args:
        timeframe: ``'today'`` restricts the query to rows whose
            ``created_at`` falls within the current UTC+09:00 calendar day;
            any other value queries without a date filter.

    Returns:
        ``(count, top_dishes, top_targets)``: the number of rows, the five
        most frequent dish names (with the ``[Custom]`` marker removed) and
        the five most frequent comma-separated target entries, both as
        ``value_counts`` Series. On any failure the error is printed and
        ``(0, empty Series, empty Series)`` is returned.
    """
    try:
        client = init_supabase()
        query = client.table("usage_log").select("dish, target")
        if timeframe == 'today':
            day_start = datetime.now(timezone(timedelta(hours=9))).replace(
                hour=0, minute=0, second=0, microsecond=0
            )
            day_end = day_start + timedelta(days=1)
            query = query.gte("created_at", day_start.isoformat()).lt("created_at", day_end.isoformat())
        rows = query.execute().data
        count = len(rows)
        if count == 0:
            return count, pd.Series(dtype=int), pd.Series(dtype=int)

        log_df = pd.DataFrame(rows)
        # Drop the "[Custom]" marker so custom and catalogue dishes are counted together.
        log_df['clean_dish'] = (
            log_df['dish'].astype(str).str.replace(r'\[Custom\]', '', regex=True).str.strip()
        )
        named = log_df[log_df['clean_dish'] != '']
        top_dishes = named['clean_dish'].value_counts().head(5)

        # A target cell may hold several comma-separated ingredients.
        all_targets = [
            piece.strip()
            for value in log_df['target']
            if value
            for piece in str(value).split(',')
            if piece.strip()
        ]
        top_targets = pd.Series(all_targets).value_counts().head(5)
        return count, top_dishes, top_targets
    except Exception as e:
        print(f"ํ†ต๊ณ„ ๋ฐ์ดํ„ฐ ๋กœ๋“œ ์‹คํŒจ ({timeframe}): {e}")
        return 0, pd.Series(dtype=int), pd.Series(dtype=int)
@st.cache_data(ttl=600)
def get_wordcloud_text(timeframe='today'):
    """Return all logged target entries joined by single spaces.

    Args:
        timeframe: ``'today'`` restricts the query to rows whose
            ``created_at`` falls within the current UTC+09:00 calendar day;
            any other value queries without a date filter.

    Returns:
        A space-separated string of the comma-split ``target`` values of
        ``usage_log``; an empty string when nothing is found or on failure
        (the error is printed).
    """
    try:
        client = init_supabase()
        query = client.table("usage_log").select("target")
        if timeframe == 'today':
            day_start = datetime.now(timezone(timedelta(hours=9))).replace(
                hour=0, minute=0, second=0, microsecond=0
            )
            day_end = day_start + timedelta(days=1)
            query = query.gte("created_at", day_start.isoformat()).lt("created_at", day_end.isoformat())
        rows = query.execute().data
        words = []
        for entry in rows or []:
            raw_target = entry['target']
            if not raw_target:
                continue
            words.extend(piece.strip() for piece in str(raw_target).split(',') if piece.strip())
        return " ".join(words)
    except Exception as e:
        print(f"์›Œ๋“œํด๋ผ์šฐ๋“œ ๋ฐ์ดํ„ฐ ๋กœ๋“œ ์‹คํŒจ: {e}")
        return ""
def save_stopwords_to_db(words_string):
    """Insert each comma-separated word into the ``stopwords`` table.

    Args:
        words_string: Comma-separated words; blanks are ignored.

    Returns:
        ``(saved_any, message)`` where ``saved_any`` is True when at least one
        insert succeeded and ``message`` summarises the saved / duplicate /
        failed counts. Insert errors whose text contains "duplicate" are
        counted as duplicates, all others as failures.
    """
    words = list(filter(None, (piece.strip() for piece in words_string.split(','))))
    if not words:
        return False, "์ €์žฅํ•  ๋‹จ์–ด๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค."

    client = init_supabase()
    success_count = duplicate_count = fail_count = 0
    for word in words:
        try:
            client.table("stopwords").insert({"word": word}).execute()
        except Exception as e:
            if 'duplicate' in str(e).lower():
                duplicate_count += 1
            else:
                fail_count += 1
        else:
            success_count += 1

    saved_any = success_count > 0
    if saved_any:
        # Cached reads (including the stopword list) are now stale.
        st.cache_data.clear()

    msg_parts = []
    if saved_any:
        msg_parts.append(f"โœ… {success_count}๊ฐœ ์ €์žฅ")
    if duplicate_count > 0:
        msg_parts.append(f"โš ๏ธ {duplicate_count}๊ฐœ ์ค‘๋ณต")
    if fail_count > 0:
        msg_parts.append(f"โŒ {fail_count}๊ฐœ ์‹คํŒจ")
    return saved_any, ", ".join(msg_parts)
@st.cache_data(ttl=60)
def get_board_messages():
    """Return the 50 most recent ``board`` rows with a ``display_time`` field.

    Each row's ``created_at`` is parsed with ``datetime.fromisoformat`` and
    rendered at UTC+09:00 as ``"%m/%d %H:%M"`` under the ``display_time`` key.

    Returns:
        The list of row dicts (newest first), or an empty list when the table
        is empty or anything fails (the error is printed).
    """
    try:
        supabase = init_supabase()
        response = supabase.table("board").select("*").order("created_at", desc=True).limit(50).execute()
        if response.data:
            kst = timezone(timedelta(hours=9))
            for item in response.data:
                dt = datetime.fromisoformat(item['created_at'])
                # Convert using the timestamp's own offset instead of blindly
                # adding nine hours, which is only right for UTC values.
                # Naive timestamps are treated as UTC, matching the previous
                # "+9h" result for them.
                if dt.tzinfo is None:
                    dt = dt.replace(tzinfo=timezone.utc)
                item['display_time'] = dt.astimezone(kst).strftime("%m/%d %H:%M")
            return response.data
        return []
    except Exception as e:
        print(f"๊ฒŒ์‹œํŒ ๋กœ๋“œ ์‹คํŒจ: {e}")
        return []
def save_board_message(nickname, content):
    """Insert a message into the ``board`` table and clear Streamlit's data cache.

    Returns:
        True on success; False when ``nickname`` or ``content`` is empty or
        when the insert fails (the error is printed).
    """
    if not (nickname and content):
        return False
    try:
        client = init_supabase()
        record = {"nickname": nickname, "content": content}
        client.table("board").insert(record).execute()
        st.cache_data.clear()
    except Exception as e:
        print(f"๊ฒŒ์‹œํŒ ์ €์žฅ ์‹คํŒจ: {e}")
        return False
    return True
def save_feedback_to_db(feedback_text):
    """Insert ``feedback_text`` into the ``feedback`` table with a UTC+09:00 timestamp.

    Returns:
        True on success, False when the insert fails (the error is printed).
    """
    try:
        client = init_supabase()
        record = {"content": feedback_text, "created_at": get_kst_now_iso()}
        client.table("feedback").insert(record).execute()
    except Exception as e:
        print(f"ํ”ผ๋“œ๋ฐฑ ์ €์žฅ ์—๋Ÿฌ: {e}")
        return False
    return True
def save_log_to_db(dish, target, stops, w1, w2, w3, w4, rec_list=None, is_custom=False):
    """Insert one row into ``usage_log`` and return its id.

    Args:
        dish: Dish name; stored with a ``[Custom] `` prefix when ``is_custom``.
        target: Value stored in the ``target`` column.
        stops: Iterable of stopwords, stored comma-joined (or a placeholder
            string when empty).
        w1, w2, w3, w4: Stored as ``w_w2v``, ``w_d2v``, ``w_method``, ``w_cat``.
        rec_list: Optional recommendations; the first three are stored as
            ``rec_1`` .. ``rec_3`` (missing ones as None).
        is_custom: Whether the dish came from custom input.

    Returns:
        The ``id`` of the inserted row, or None when the response carries no
        data or the insert fails (the error is printed).
    """
    try:
        client = init_supabase()
        # First three recommendations, padded with None.
        picks = [
            rec_list[i] if rec_list and len(rec_list) > i else None
            for i in range(3)
        ]
        if is_custom:
            dish_name_to_save = f"[Custom] {dish}"
        else:
            dish_name_to_save = dish
        record = {
            "dish": dish_name_to_save,
            "target": target,
            "stops": ", ".join(stops) if stops else "์—†์Œ",
            "w_w2v": w1,
            "w_d2v": w2,
            "w_method": w3,
            "w_cat": w4,
            "rec_1": picks[0],
            "rec_2": picks[1],
            "rec_3": picks[2],
            "created_at": get_kst_now_iso(),
        }
        result = client.table("usage_log").insert(record).execute()
        if not result.data:
            return None
        return result.data[0]['id']
    except Exception as e:
        print(f"๋กœ๊ทธ ์ €์žฅ ์—๋Ÿฌ: {e}")
        return None
def update_feedback_in_db(log_id, status):
    """Set ``satisfaction`` to ``status`` on the ``usage_log`` row with id ``log_id``.

    Returns:
        True when the update was issued; False when ``log_id`` is falsy or the
        update fails (the error is printed).
    """
    try:
        client = init_supabase()
        if not log_id:
            return False
        client.table("usage_log").update({"satisfaction": status}).eq("id", log_id).execute()
        return True
    except Exception as e:
        print(f"๋งŒ์กฑ๋„ ์—…๋ฐ์ดํŠธ ์—๋Ÿฌ: {e}")
        return False
# ==========================================
# 2. Data and model loading (lazy loading)
# ==========================================
# Module-level state, populated by load_resources(). Everything starts out
# empty; ensure_initialized() uses ``df is None`` as the "not loaded" marker.
w2v_model = None  # gensim Word2Vec model loaded from models/w2v.model
d2v_model = None  # gensim Doc2Vec model loaded from models/d2v.model
df = None  # recipe DataFrame read from data/final_recipe_data.csv
stats = None  # object unpickled from data/stats.pkl
price_map = {}  # ingredient -> rank, from data/price_rank.csv (empty if unreadable)
global_stopwords_set = set()  # reset by load_resources(); the substitute_* functions build their own local set
all_ingredients_set = set()  # union of every recipe's ingredient tokens
# The following are copied out of ``stats`` under the same key names.
method_map = {}  # looked up by recipe id and compared with a recipe's cooking method
recipes_by_ingredient = {}  # ingredient -> recipe ids containing it
ing_method_counts = {}  # per-ingredient counts, passed to get_stat_score
ing_cat_counts = {}  # per-ingredient counts, passed to get_stat_score
total_method_counts = {}  # overall counts, passed to get_stat_score
total_cat_counts = {}  # overall counts, passed to get_stat_score
TOTAL_RECIPES = 0  # denominator for the baseline probability in get_stat_score
def load_resources():
    """Load models, recipe data and statistics into the module-level globals.

    Paths are resolved relative to the parent of the directory that holds
    this file. Everything is loaded into locals first and published to the
    globals only at the end, with ``df`` assigned last: ``ensure_initialized``
    treats ``df is not None`` as "fully loaded", so a failure part-way through
    must not leave ``df`` set while the other globals are still empty.

    A missing or unreadable ``data/price_rank.csv`` is tolerated (the price
    map stays empty and a message is printed); any other failure propagates.

    NOTE(review): the column-name literals appear mis-encoded in this copy of
    the file; they are kept unchanged - verify them against the CSV header.
    """
    global w2v_model, d2v_model, df, stats, price_map, global_stopwords_set, all_ingredients_set
    global method_map, recipes_by_ingredient, ing_method_counts, ing_cat_counts, total_method_counts, total_cat_counts, TOTAL_RECIPES
    print("Loading resources... (This may take a while)")
    # Base path: the parent folder of the folder containing logic.py.
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    # mmap='r' keeps memory usage down by reading the arrays from disk.
    loaded_w2v = Word2Vec.load(os.path.join(base_dir, "models/w2v.model"), mmap='r')
    loaded_d2v = Doc2Vec.load(os.path.join(base_dir, "models/d2v.model"), mmap='r')
    recipes = pd.read_csv(os.path.join(base_dir, "data/final_recipe_data.csv"))
    # The token column is stored as the repr of a Python list.
    recipes['์žฌ๋ฃŒํ† ํฐ'] = recipes['์žฌ๋ฃŒํ† ํฐ'].apply(literal_eval)
    # SECURITY: pickle.load can execute arbitrary code; stats.pkl must only
    # ever come from a trusted build artefact.
    with open(os.path.join(base_dir, "data/stats.pkl"), "rb") as f:
        loaded_stats = pickle.load(f)
    try:
        price_df = pd.read_csv(os.path.join(base_dir, "data/price_rank.csv"), encoding='utf-8-sig')
        price_df.columns = price_df.columns.str.strip()
        loaded_price_map = dict(zip(price_df['ingredient'], price_df['rank']))
    except Exception as e:
        # Optional file: price ranks then come from the keyword rules only.
        print(f"price_rank.csv could not be loaded, using keyword rules only: {e}")
        loaded_price_map = {}
    ingredients = set()
    for ings in recipes['์žฌ๋ฃŒํ† ํฐ']:
        ingredients.update(ings)
    # Unpack the statistics before touching any global so that a missing key
    # raises while the module is still in its "not loaded" state.
    loaded_method_map = loaded_stats["method_map"]
    loaded_recipes_by_ingredient = loaded_stats["recipes_by_ingredient"]
    loaded_ing_method_counts = loaded_stats["ing_method_counts"]
    loaded_ing_cat_counts = loaded_stats["ing_cat_counts"]
    loaded_total_method_counts = loaded_stats["total_method_counts"]
    loaded_total_cat_counts = loaded_stats["total_cat_counts"]
    loaded_total_recipes = loaded_stats["TOTAL_RECIPES"]

    # Publish everything; df goes last because it is the "loaded" marker.
    w2v_model = loaded_w2v
    d2v_model = loaded_d2v
    stats = loaded_stats
    price_map = loaded_price_map
    global_stopwords_set = set()
    all_ingredients_set = ingredients
    method_map = loaded_method_map
    recipes_by_ingredient = loaded_recipes_by_ingredient
    ing_method_counts = loaded_ing_method_counts
    ing_cat_counts = loaded_ing_cat_counts
    total_method_counts = loaded_total_method_counts
    total_cat_counts = loaded_total_cat_counts
    TOTAL_RECIPES = loaded_total_recipes
    df = recipes
    print("Resources loaded successfully!")
def ensure_initialized():
    """Call load_resources() unless the recipe DataFrame is already set."""
    if df is not None:
        return
    load_resources()
# The previous eager (import-time) loading call was removed:
# w2v_model, d2v_model, ... = load_resources()
# ==========================================
# 3. Core scoring logic
# ==========================================
def cos_sim(vec_a, vec_b):
    """Return the cosine similarity of two vectors, clipped below at 0.0.

    A small epsilon is added to the denominator so zero vectors give 0.0
    instead of dividing by zero.
    """
    denom = np.linalg.norm(vec_a) * np.linalg.norm(vec_b) + 1e-9
    similarity = float(np.dot(vec_a, vec_b) / denom)
    return similarity if similarity > 0.0 else 0.0
def get_stat_score(ingredient, target_key, ing_count_dict, total_count_dict, total_n, min_count=5):
    """Return how over-represented ``target_key`` is for ``ingredient``.

    The score is P(target_key | ingredient) / P(target_key), i.e. a lift
    ratio: 1.0 means the ingredient occurs with ``target_key`` as often as
    the corpus average.

    Args:
        ingredient: Ingredient to score.
        target_key: Context key (e.g. a method or category) to score against.
        ing_count_dict: Mapping ingredient -> {key: count}.
        total_count_dict: Mapping key -> count over the whole corpus.
        total_n: Corpus size used as the baseline denominator.
        min_count: Ingredients with fewer total observations score 0.0.

    Returns:
        The lift ratio as a float, or 0.0 when the ingredient is unknown or
        too rare, when ``target_key`` never occurs, or when ``total_n`` is 0.
    """
    cnts = ing_count_dict.get(ingredient)
    if not cnts:
        return 0.0
    # .get() so plain dicts behave like Counters for unseen keys.
    ing_target_count = cnts.get(target_key, 0)
    ing_total_count = sum(cnts.values())
    if ing_total_count < min_count or ing_total_count <= 0:
        return 0.0
    if not total_n:
        return 0.0
    baseline_prob = total_count_dict.get(target_key, 0) / total_n
    if baseline_prob == 0:
        return 0.0
    prob_ing_context = ing_target_count / ing_total_count
    return prob_ing_context / baseline_prob
def get_estimated_price_rank(ing_name, price_map):
    """Return the price rank of ``ing_name``.

    Lookup order: an exact entry in ``price_map``; then the default rank 3
    for names containing any PRICE_RULE_EXCEPTIONS substring; then the rank
    of the first PRICE_KEYWORD_RULES entry with a keyword contained in the
    name; otherwise the default rank 3.
    """
    if ing_name in price_map:
        return price_map[ing_name]
    default_rank = 3
    for exp in PRICE_RULE_EXCEPTIONS:
        if exp in ing_name:
            return default_rank
    return next(
        (
            rank
            for rank, keywords in PRICE_KEYWORD_RULES
            if any(kw in ing_name for kw in keywords)
        ),
        default_rank,
    )
# ==========================================
# 4. Substitute recommendation algorithm (DB-based)
# ==========================================
def substitute_single(recipe_id, target_ing, user_stopwords, w_w2v, w_d2v, w_method, w_cat, topn=10):
    """Rank substitutes for one ingredient of a catalogued recipe.

    Candidates are the 50 nearest Word2Vec neighbours of ``target_ing``.
    After stopword stripping and filtering, each is scored on four signals
    (Word2Vec similarity, Doc2Vec similarity to recipes that use the
    candidate with the same cooking method, method lift, category lift).
    Each signal is min-max normalised over the surviving candidates and
    combined with the given weights.

    Args:
        recipe_id: Id of the recipe in the loaded recipe table.
        target_ing: Ingredient to replace.
        user_stopwords: Extra stopwords, merged with the global ones.
        w_w2v, w_d2v, w_method, w_cat: Weights of the four signals.
        topn: Maximum number of rows returned.

    Returns:
        A DataFrame sorted by the final-score column (best first) holding the
        raw and normalised signals plus ``saving_score`` (target price rank
        minus candidate price rank); an empty DataFrame when ``target_ing``
        is not in the Word2Vec vocabulary or no candidate survives.

    Raises:
        IndexError: If ``recipe_id`` is not present in the recipe table.
    """
    row = df[df['๋ ˆ์‹œํ”ผ์ผ๋ จ๋ฒˆํ˜ธ'] == recipe_id].iloc[0]
    current_method = row['์š”๋ฆฌ๋ฐฉ๋ฒ•๋ณ„๋ช…']
    current_cat = row['์š”๋ฆฌ์ข…๋ฅ˜๋ณ„๋ช…_์„ธ๋ถ„ํ™”']
    context_ings = row['์žฌ๋ฃŒํ† ํฐ']
    tag = f"recipe_{recipe_id}"
    if target_ing not in w2v_model.wv:
        return pd.DataFrame()
    total_weight = w_w2v + w_d2v + w_method + w_cat
    if total_weight == 0:
        total_weight = 1.0
    vec_recipe = None
    if w_d2v > 0 and tag in d2v_model.dv:
        vec_recipe = d2v_model.dv[tag]
    target_rank = get_estimated_price_rank(target_ing, price_map)
    candidates_raw = w2v_model.wv.most_similar(target_ing, topn=50)
    temp_results = []
    seen_candidates = set()
    # Global stopwords are fetched at call time so edits apply without a restart.
    final_stopwords = set(user_stopwords) | set(load_global_stopwords())
    for cand, _ in candidates_raw:
        clean_cand = cand
        if final_stopwords:
            # Stopwords are removed as substrings of the candidate name.
            for stop in final_stopwords:
                clean_cand = clean_cand.replace(stop, "")
            clean_cand = clean_cand.strip()
        if not clean_cand:
            continue
        if clean_cand in final_stopwords:
            continue
        if clean_cand in context_ings:
            continue
        if clean_cand == target_ing:
            continue
        if clean_cand not in w2v_model.wv:
            continue
        if clean_cand in seen_candidates:
            continue
        seen_candidates.add(clean_cand)
        # Recompute the similarity: stripping may have changed the word.
        real_score_w2v = w2v_model.wv.similarity(target_ing, clean_cand)
        s_w2v = max(0.0, real_score_w2v)
        if s_w2v < 0.35:
            continue
        s_d2v = 0.0
        if w_d2v > 0 and vec_recipe is not None:
            rid_list = recipes_by_ingredient.get(clean_cand, [])
            same_method_ids = [r for r in rid_list if method_map.get(r) == current_method]
            if len(same_method_ids) > 20:
                # A private RandomState(42) draws the same sample as reseeding
                # the global generator did, without clobbering the global
                # NumPy RNG state for the rest of the process.
                same_method_ids = np.random.RandomState(42).choice(same_method_ids, 20, replace=False)
            if same_method_ids is not None and len(same_method_ids) > 0:
                sims = []
                for r in same_method_ids:
                    rt = f"recipe_{r}"
                    if rt in d2v_model.dv:
                        sims.append(cos_sim(vec_recipe, d2v_model.dv[rt]))
                if sims:
                    s_d2v = np.mean(sims)
        s_method = 0.0 if w_method <= 0 else get_stat_score(clean_cand, current_method, ing_method_counts, total_method_counts, TOTAL_RECIPES)
        s_cat = 0.0 if w_cat <= 0 else get_stat_score(clean_cand, current_cat, ing_cat_counts, total_cat_counts, TOTAL_RECIPES)
        cand_rank = get_estimated_price_rank(clean_cand, price_map)
        saving_score = target_rank - cand_rank
        temp_results.append({"๋Œ€์ฒด์žฌ๋ฃŒ": clean_cand, "raw_W2V": s_w2v, "raw_D2V": s_d2v, "raw_Method": s_method, "raw_Category": s_cat, "saving_score": saving_score})
    if not temp_results:
        return pd.DataFrame()
    df_res = pd.DataFrame(temp_results)
    cols = ["raw_W2V", "raw_D2V", "raw_Method", "raw_Category"]
    norm_cols = ["W2V", "D2V", "Method", "Category"]
    for raw_col, norm_col in zip(cols, norm_cols):
        min_val = df_res[raw_col].min()
        max_val = df_res[raw_col].max()
        if max_val - min_val == 0:
            # A constant column carries no ranking information.
            df_res[norm_col] = 0.5
        else:
            df_res[norm_col] = (df_res[raw_col] - min_val) / (max_val - min_val)
    df_res["์ตœ์ข…์ ์ˆ˜"] = ((df_res["W2V"]*w_w2v) + (df_res["D2V"]*w_d2v) + (df_res["Method"]*w_method) + (df_res["Category"]*w_cat)) / total_weight
    return df_res.sort_values("์ตœ์ข…์ ์ˆ˜", ascending=False).head(topn).reset_index(drop=True)
def substitute_multi(recipe_id, targets, user_stopwords, w_w2v, w_d2v, w_method, w_cat, beam_width=3, result_topn=3):
    """Beam-search joint substitutes for several ingredients of a catalogued recipe.

    Targets are processed in order. For every path in the beam, the 30
    nearest Word2Vec neighbours of the current target are filtered, scored,
    min-max normalised within that path, and appended to the path. The
    ``beam_width`` best paths by accumulated score survive each step. A
    target missing from the Word2Vec vocabulary is kept unchanged in every
    path.

    Args:
        recipe_id: Id of the recipe in the loaded recipe table.
        targets: Ingredients to replace, in processing order.
        user_stopwords: Extra stopwords, merged with the global ones.
        w_w2v, w_d2v, w_method, w_cat: Weights of the four signals.
        beam_width: Number of paths kept after each target.
        result_topn: Maximum number of results returned.

    Returns:
        A list of ``(substitutes, avg_score, total_saving_score)`` tuples,
        where ``avg_score`` is the accumulated score divided by the number of
        targets and ``total_saving_score`` is the summed target price ranks
        minus the summed substitute price ranks.

    Raises:
        IndexError: If ``recipe_id`` is not present in the recipe table.
    """
    row = df[df['๋ ˆ์‹œํ”ผ์ผ๋ จ๋ฒˆํ˜ธ'] == recipe_id].iloc[0]
    current_method = row['์š”๋ฆฌ๋ฐฉ๋ฒ•๋ณ„๋ช…']
    current_cat = row['์š”๋ฆฌ์ข…๋ฅ˜๋ณ„๋ช…_์„ธ๋ถ„ํ™”']
    initial_context = row['์žฌ๋ฃŒํ† ํฐ']
    tag = f"recipe_{recipe_id}"
    vec_recipe = None
    if w_d2v > 0 and tag in d2v_model.dv:
        vec_recipe = d2v_model.dv[tag]
    total_weight = w_w2v + w_d2v + w_method + w_cat
    if total_weight == 0:
        total_weight = 1.0
    target_ranks_sum = 0
    for t in targets:
        target_ranks_sum += get_estimated_price_rank(t, price_map)
    # Global stopwords are fetched at call time so edits apply without a restart.
    final_stopwords = set(user_stopwords) | set(load_global_stopwords())
    # Each beam entry is (accumulated score, substitutes so far, current context).
    beam = [(0.0, [], initial_context)]
    for target_ing in targets:
        next_beam = []
        if target_ing not in w2v_model.wv:
            # Unknown ingredient: keep it as its own "substitute".
            for score, subs, ctx in beam:
                next_beam.append((score, subs + [target_ing], ctx))
            beam = next_beam
            continue
        # The neighbour list depends only on the target, so query it once per
        # target instead of once per beam path.
        candidates = w2v_model.wv.most_similar(target_ing, topn=30)
        for path_score, path_subs, path_ctx in beam:
            current_ctx_ing = [x for x in path_ctx if x != target_ing]
            temp_candidates = []
            seen_candidates = set()
            for cand, _ in candidates:
                clean_cand = cand
                if final_stopwords:
                    # Stopwords are removed as substrings of the candidate name.
                    for stop in final_stopwords:
                        clean_cand = clean_cand.replace(stop, "")
                    clean_cand = clean_cand.strip()
                if not clean_cand:
                    continue
                if clean_cand in final_stopwords:
                    continue
                if clean_cand in current_ctx_ing or clean_cand in path_subs:
                    continue
                if clean_cand == target_ing:
                    continue
                if clean_cand not in w2v_model.wv:
                    continue
                if clean_cand in seen_candidates:
                    continue
                seen_candidates.add(clean_cand)
                sim_orig = w2v_model.wv.similarity(target_ing, clean_cand)
                sim_orig = max(0.0, sim_orig)
                if sim_orig < 0.3:
                    continue
                # "Harmony": mean similarity to the other ingredients on this path.
                harmony_scores = [w2v_model.wv.similarity(clean_cand, c) for c in current_ctx_ing if c in w2v_model.wv]
                sim_harmony = np.mean(harmony_scores) if harmony_scores else 0.0
                s_w2v = 0.5 * sim_orig + 0.5 * max(0.0, sim_harmony)
                s_d2v = 0.0
                if vec_recipe is not None:
                    rid_list = recipes_by_ingredient.get(clean_cand, [])
                    same_method_ids = [r for r in rid_list if method_map.get(r) == current_method]
                    if len(same_method_ids) > 10:
                        # A private RandomState(42) draws the same sample as
                        # reseeding the global generator did, without
                        # clobbering the global NumPy RNG state.
                        same_method_ids = np.random.RandomState(42).choice(same_method_ids, 10, replace=False)
                    if same_method_ids is not None and len(same_method_ids) > 0:
                        sims = []
                        for r in same_method_ids:
                            rt = f"recipe_{r}"
                            if rt in d2v_model.dv:
                                sims.append(cos_sim(vec_recipe, d2v_model.dv[rt]))
                        if sims:
                            s_d2v = np.mean(sims)
                s_method = 0.0 if w_method <= 0 else get_stat_score(clean_cand, current_method, ing_method_counts, total_method_counts, TOTAL_RECIPES)
                s_cat = 0.0 if w_cat <= 0 else get_stat_score(clean_cand, current_cat, ing_cat_counts, total_cat_counts, TOTAL_RECIPES)
                temp_candidates.append({"cand": clean_cand, "raw_w2v": s_w2v, "raw_d2v": s_d2v, "raw_method": s_method, "raw_cat": s_cat})
            if not temp_candidates:
                # This path cannot be extended and is dropped from the beam.
                continue
            df_temp = pd.DataFrame(temp_candidates)
            cols = ["raw_w2v", "raw_d2v", "raw_method", "raw_cat"]
            for col in cols:
                min_val = df_temp[col].min()
                max_val = df_temp[col].max()
                if max_val - min_val == 0:
                    # A constant column carries no ranking information.
                    df_temp[col + "_norm"] = 0.5
                else:
                    df_temp[col + "_norm"] = (df_temp[col] - min_val) / (max_val - min_val)
            for _, r in df_temp.iterrows():
                weighted_sum = ((r["raw_w2v_norm"]*w_w2v) + (r["raw_d2v_norm"]*w_d2v) + (r["raw_method_norm"]*w_method) + (r["raw_cat_norm"]*w_cat)) / total_weight
                new_total_score = path_score + weighted_sum
                new_subs = path_subs + [r["cand"]]
                new_ctx = current_ctx_ing + [r["cand"]]
                next_beam.append((new_total_score, new_subs, new_ctx))
        next_beam.sort(key=lambda x: x[0], reverse=True)
        beam = next_beam[:beam_width]
    final_results = []
    for score, subs, _ in beam:
        avg_score = score / len(targets) if targets else 0.0
        cand_ranks_sum = 0
        for sub_ing in subs:
            cand_ranks_sum += get_estimated_price_rank(sub_ing, price_map)
        total_saving_score = target_ranks_sum - cand_ranks_sum
        final_results.append((subs, avg_score, total_saving_score))
    return final_results[:result_topn]
# ==========================================
# 5. Substitution algorithm for custom input (revised)
# ==========================================
def substitute_single_custom(target_ing, context_ings_list, user_stopwords, w_w2v, w_d2v, excluded_ings=None, topn=10):
    """Rank substitutes for one ingredient of a user-supplied ingredient list.

    Works like the catalogue variant but without a recipe row: the Doc2Vec
    context vector is inferred from ``context_ings_list``, and the method
    and category signals are always 0.0.

    Args:
        target_ing: Ingredient to replace.
        context_ings_list: The user's ingredient list.
        user_stopwords: Extra stopwords, merged with the global ones.
        w_w2v, w_d2v: Weights of the Word2Vec and Doc2Vec signals.
        excluded_ings: Optional ingredients that must not be suggested.
        topn: Maximum number of rows returned.

    Returns:
        A DataFrame sorted by the final-score column (best first); an empty
        DataFrame when ``target_ing`` is not in the Word2Vec vocabulary or no
        candidate survives.
    """
    if target_ing not in w2v_model.wv:
        return pd.DataFrame()
    total_weight = w_w2v + w_d2v
    if total_weight == 0:
        total_weight = 1.0
    vec_custom_context = None
    if w_d2v > 0:
        # Only words the Doc2Vec model knows can contribute to the inference.
        valid_context = [word for word in context_ings_list if word in d2v_model.wv]
        if valid_context:
            vec_custom_context = d2v_model.infer_vector(valid_context)
    target_rank = get_estimated_price_rank(target_ing, price_map)
    candidates_raw = w2v_model.wv.most_similar(target_ing, topn=50)
    temp_results = []
    seen_candidates = set()
    # Global stopwords are fetched at call time so edits apply without a restart.
    final_stopwords = set(user_stopwords) | set(load_global_stopwords())
    excluded_set = set(excluded_ings) if excluded_ings else set()
    for cand, _ in candidates_raw:
        clean_cand = cand
        if final_stopwords:
            # Stopwords are removed as substrings of the candidate name.
            for stop in final_stopwords:
                clean_cand = clean_cand.replace(stop, "")
            clean_cand = clean_cand.strip()
        if not clean_cand:
            continue
        if clean_cand in final_stopwords:
            continue
        if clean_cand in excluded_set:
            continue
        if clean_cand in context_ings_list:
            continue
        if clean_cand == target_ing:
            continue
        if clean_cand not in w2v_model.wv:
            continue
        if clean_cand in seen_candidates:
            continue
        seen_candidates.add(clean_cand)
        # Recompute the similarity: stripping may have changed the word.
        real_score_w2v = w2v_model.wv.similarity(target_ing, clean_cand)
        s_w2v = max(0.0, real_score_w2v)
        if s_w2v < 0.35:
            continue
        s_d2v = 0.0
        if w_d2v > 0 and vec_custom_context is not None:
            rid_list = recipes_by_ingredient.get(clean_cand, [])
            if len(rid_list) > 20:
                # A private RandomState(42) draws the same sample as reseeding
                # the global generator did, without clobbering the global
                # NumPy RNG state for the rest of the process.
                rid_list = np.random.RandomState(42).choice(rid_list, 20, replace=False)
            if rid_list is not None and len(rid_list) > 0:
                sims = []
                for r in rid_list:
                    rt = f"recipe_{r}"
                    if rt in d2v_model.dv:
                        sims.append(cos_sim(vec_custom_context, d2v_model.dv[rt]))
                if sims:
                    s_d2v = np.mean(sims)
        # No recipe row is available, so these two signals are fixed at zero.
        s_method, s_cat = 0.0, 0.0
        cand_rank = get_estimated_price_rank(clean_cand, price_map)
        saving_score = target_rank - cand_rank
        temp_results.append({"๋Œ€์ฒด์žฌ๋ฃŒ": clean_cand, "raw_W2V": s_w2v, "raw_D2V": s_d2v, "raw_Method": s_method, "raw_Category": s_cat, "saving_score": saving_score})
    if not temp_results:
        return pd.DataFrame()
    df_res = pd.DataFrame(temp_results)
    cols = ["raw_W2V", "raw_D2V"]
    norm_cols = ["W2V", "D2V"]
    for raw_col, norm_col in zip(cols, norm_cols):
        min_val = df_res[raw_col].min()
        max_val = df_res[raw_col].max()
        if max_val - min_val == 0:
            # A constant column carries no ranking information.
            df_res[norm_col] = 0.5
        else:
            df_res[norm_col] = (df_res[raw_col] - min_val) / (max_val - min_val)
    df_res["์ตœ์ข…์ ์ˆ˜"] = ((df_res["W2V"]*w_w2v) + (df_res["D2V"]*w_d2v)) / total_weight
    return df_res.sort_values("์ตœ์ข…์ ์ˆ˜", ascending=False).head(topn).reset_index(drop=True)
def substitute_multi_custom(targets, context_ings_list, user_stopwords, w_w2v, w_d2v, excluded_ings=None, beam_width=3, result_topn=3):
    """Beam-search joint substitutes for several ingredients of a custom list.

    Targets are processed in order. For every path in the beam, the 30
    nearest Word2Vec neighbours of the current target are filtered, scored
    on Word2Vec and Doc2Vec signals, min-max normalised within that path and
    appended to the path. The ``beam_width`` best paths survive each step. A
    target missing from the Word2Vec vocabulary is kept unchanged.

    The Doc2Vec context vector depends only on the path's current context,
    so it is inferred at most once per path, on first use, rather than once
    per candidate. ``infer_vector`` is a training-based estimate, so a single
    vector per path also means all candidates of a path are compared against
    the same context.

    Args:
        targets: Ingredients to replace, in processing order.
        context_ings_list: The user's ingredient list.
        user_stopwords: Extra stopwords, merged with the global ones.
        w_w2v, w_d2v: Weights of the Word2Vec and Doc2Vec signals.
        excluded_ings: Optional ingredients that must not be suggested.
        beam_width: Number of paths kept after each target.
        result_topn: Maximum number of results returned.

    Returns:
        A list of ``(substitutes, avg_score, total_saving_score)`` tuples,
        where ``avg_score`` is the accumulated score divided by the number of
        targets and ``total_saving_score`` is the summed target price ranks
        minus the summed substitute price ranks.
    """
    total_weight = w_w2v + w_d2v
    if total_weight == 0:
        total_weight = 1.0
    target_ranks_sum = 0
    for t in targets:
        target_ranks_sum += get_estimated_price_rank(t, price_map)
    # Global stopwords are fetched at call time so edits apply without a restart.
    final_stopwords = set(user_stopwords) | set(load_global_stopwords())
    excluded_set = set(excluded_ings) if excluded_ings else set()
    # Each beam entry is (accumulated score, substitutes so far, current context).
    beam = [(0.0, [], context_ings_list)]
    for target_ing in targets:
        next_beam = []
        if target_ing not in w2v_model.wv:
            # Unknown ingredient: keep it as its own "substitute".
            for score, subs, ctx in beam:
                next_beam.append((score, subs + [target_ing], ctx))
            beam = next_beam
            continue
        # The neighbour list depends only on the target, so query it once per
        # target instead of once per beam path.
        candidates = w2v_model.wv.most_similar(target_ing, topn=30)
        for path_score, path_subs, path_ctx in beam:
            current_ctx_ing = [x for x in path_ctx if x != target_ing]
            temp_candidates = []
            seen_candidates = set()
            # Lazily inferred context vector for this path (see docstring).
            vec_path_context = None
            path_vec_ready = False
            for cand, _ in candidates:
                clean_cand = cand
                if final_stopwords:
                    # Stopwords are removed as substrings of the candidate name.
                    for stop in final_stopwords:
                        clean_cand = clean_cand.replace(stop, "")
                    clean_cand = clean_cand.strip()
                if not clean_cand:
                    continue
                if clean_cand in final_stopwords:
                    continue
                if clean_cand in excluded_set:
                    continue
                if clean_cand in current_ctx_ing or clean_cand in path_subs:
                    continue
                if clean_cand == target_ing:
                    continue
                if clean_cand not in w2v_model.wv:
                    continue
                if clean_cand in seen_candidates:
                    continue
                seen_candidates.add(clean_cand)
                sim_orig = w2v_model.wv.similarity(target_ing, clean_cand)
                sim_orig = max(0.0, sim_orig)
                if sim_orig < 0.3:
                    continue
                # "Harmony": mean similarity to the other ingredients on this path.
                harmony_scores = [w2v_model.wv.similarity(clean_cand, c) for c in current_ctx_ing if c in w2v_model.wv]
                sim_harmony = np.mean(harmony_scores) if harmony_scores else 0.0
                s_w2v = 0.5 * sim_orig + 0.5 * max(0.0, sim_harmony)
                s_d2v = 0.0
                if w_d2v > 0:
                    if not path_vec_ready:
                        valid_path_ctx = [word for word in current_ctx_ing if word in d2v_model.wv]
                        if valid_path_ctx:
                            vec_path_context = d2v_model.infer_vector(valid_path_ctx)
                        path_vec_ready = True
                    if vec_path_context is not None:
                        rid_list = recipes_by_ingredient.get(clean_cand, [])
                        if len(rid_list) > 10:
                            # A private RandomState(42) draws the same sample
                            # as reseeding the global generator did, without
                            # clobbering the global NumPy RNG state.
                            rid_list = np.random.RandomState(42).choice(rid_list, 10, replace=False)
                        if rid_list is not None and len(rid_list) > 0:
                            sims = []
                            for r in rid_list:
                                rt = f"recipe_{r}"
                                if rt in d2v_model.dv:
                                    sims.append(cos_sim(vec_path_context, d2v_model.dv[rt]))
                            if sims:
                                s_d2v = np.mean(sims)
                temp_candidates.append({"cand": clean_cand, "raw_w2v": s_w2v, "raw_d2v": s_d2v})
            if not temp_candidates:
                # This path cannot be extended and is dropped from the beam.
                continue
            df_temp = pd.DataFrame(temp_candidates)
            cols = ["raw_w2v", "raw_d2v"]
            for col in cols:
                min_val = df_temp[col].min()
                max_val = df_temp[col].max()
                if max_val - min_val == 0:
                    # A constant column carries no ranking information.
                    df_temp[col + "_norm"] = 0.5
                else:
                    df_temp[col + "_norm"] = (df_temp[col] - min_val) / (max_val - min_val)
            for _, r in df_temp.iterrows():
                weighted_sum = ((r["raw_w2v_norm"]*w_w2v) + (r["raw_d2v_norm"]*w_d2v)) / total_weight
                new_total_score = path_score + weighted_sum
                new_subs = path_subs + [r["cand"]]
                new_ctx = current_ctx_ing + [r["cand"]]
                next_beam.append((new_total_score, new_subs, new_ctx))
        next_beam.sort(key=lambda x: x[0], reverse=True)
        beam = next_beam[:beam_width]
    final_results = []
    for score, subs, _ in beam:
        avg_score = score / len(targets) if targets else 0.0
        cand_ranks_sum = 0
        for sub_ing in subs:
            cand_ranks_sum += get_estimated_price_rank(sub_ing, price_map)
        total_saving_score = target_ranks_sum - cand_ranks_sum
        final_results.append((subs, avg_score, total_saving_score))
    return final_results[:result_topn]
# ==========================================
# 6. Ingredient-keyword recipe search (unchanged)
# ==========================================
def find_recipes_by_ingredient_keyword(keyword, topn=5):
    """Return up to ``topn`` distinct dish names having an ingredient that contains ``keyword``.

    Dishes are returned in the order they first appear in the recipe table,
    so the result is reproducible between runs (truncating a ``set`` gave an
    arbitrary selection), and the scan stops as soon as ``topn`` dishes have
    been found.

    Args:
        keyword: Substring to look for in ingredient tokens; surrounding
            whitespace is ignored.
        topn: Maximum number of dish names returned.

    Returns:
        A list of dish names; empty when ``keyword`` is blank or nothing matches.
    """
    keyword = keyword.strip()
    if not keyword:
        return []
    # Stop early only for a positive integer limit; any other value keeps the
    # plain slice semantics of the final ``[:topn]``.
    limit = topn if isinstance(topn, int) and topn > 0 else None
    matched_dishes = {}  # dict keys keep first-seen order
    for dish, ings in zip(df['์š”๋ฆฌ๋ช…'], df['์žฌ๋ฃŒํ† ํฐ']):
        if dish in matched_dishes:
            continue
        if any(keyword in ing for ing in ings):
            matched_dishes[dish] = None
            if limit is not None and len(matched_dishes) >= limit:
                break
    return list(matched_dishes)[:topn]