Initial version

- requirements.txt +8 -2
- src/__init__.py +2 -0
- src/__pycache__/__init__.cpython-313.pyc +0 -0
- src/__pycache__/post_assesment.cpython-313.pyc +0 -0
- src/__pycache__/post_search.cpython-313.pyc +0 -0
- src/app.py +75 -0
- src/post_assesment.py +33 -0
- src/post_search.py +145 -0
- towns.csv +0 -0
requirements.txt
CHANGED
@@ -1,3 +1,9 @@
-altair
 pandas
-streamlit
+streamlit
+transformers
+vk_api
+python-dotenv
+numpy
+joblib
+folium
+streamlit_folium
src/__init__.py
ADDED
@@ -0,0 +1,2 @@
+from .post_assesment import get_sentiment
+from .post_search import search_posts
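A note on the import layout: app.py and post_assesment.py use flat imports (from post_search import ...), which resolve when Streamlit runs src/app.py with src/ on sys.path, while these package-level re-exports additionally expose the helpers as src.get_sentiment and src.search_posts. A minimal sketch of using the package form from the repository root (hypothetical usage, not part of the commit; the sys.path tweak compensates for the flat imports inside the modules, and importing triggers the model download and VK connection):

import sys
sys.path.append("src")  # the modules import each other flat, so src/ must be importable directly
from src import get_sentiment, search_posts  # re-exported in src/__init__.py

posts = search_posts("котики", 10)   # requires VK_TOKEN in .env and towns.csv in the working dir
results = get_sentiment(posts)       # one {"label": ..., "score": ...} dict per post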
src/__pycache__/__init__.cpython-313.pyc
ADDED
Binary file (257 Bytes)

src/__pycache__/post_assesment.cpython-313.pyc
ADDED
Binary file (1.05 kB)

src/__pycache__/post_search.cpython-313.pyc
ADDED
Binary file (8.95 kB)
src/app.py
ADDED
@@ -0,0 +1,75 @@
+import streamlit as st
+import folium
+from streamlit_folium import st_folium
+from collections import defaultdict
+# from joblib import Parallel, delayed
+from post_assesment import get_sentiment, emotions
+from post_search import search_posts_parallel
+
+emotion_to_color = {
+    'no_emotion': "#666666",
+    'joy': "#33cc33",
+    'sadness': "#0066ff",
+    'surprise': "#ff9900",
+    'fear': "#aa2fd6",
+    'anger': "#ff0000"
+}
+
+POSTS_CNT = 500
+# TOP_CITIES_CNT = 2
+NUM_OF_WORKERS = 8
+# top_cities = cities_db.nlargest(n=TOP_CITIES_CNT, columns="population")
+
+# === Beginning of the page ===
+st.title("Sentiment analysis")
+topic = st.text_input("Enter your topic:", "котики")
+button = st.button("Start!")
+
+if button:
+    st.session_state["running"] = True
+    st.session_state.pop("results", None)
+    st.session_state.pop("posts", None)
+
+if st.session_state.get("running", False):
+    st.text("Processing query...")
+    st.session_state["posts"] = search_posts_parallel(topic, POSTS_CNT)
+    # posts_per_city = Parallel(n_jobs=NUM_OF_WORKERS) \
+    #     (
+    #         delayed(search_posts_by_pos)(topic, POSTS_CNT, city_row["city"], city_row["lat"], city_row["lon"])
+    #         for ind, city_row in top_cities.iterrows()
+    #     )
+    # posts = [post for city_list in posts_per_city for post in city_list]
+    # print(*[post.owner_id for post in posts], sep='\n', flush=True)
+    # st.session_state["posts"] = posts
+    st.text("Gathered posts...")
+    st.session_state["results"] = get_sentiment(st.session_state["posts"])
+    # st.write(st.session_state["results"])
+    st.session_state["running"] = False
+
+if "results" in st.session_state:
+    print("Got results!", flush=True)
+    posts = st.session_state["posts"]
+    results = st.session_state["results"]
+    scores = defaultdict(lambda: {e: 0.0 for e in emotions})
+    cnt = defaultdict(int)
+    names = {}
+    for i in range(len(posts)):
+        pos = posts[i].geolocation
+        names[pos] = posts[i].city_of_origin
+        cnt[pos] += 1
+        # for label, score in results[i].items():
+        #     scores[pos][label] = score
+        scores[pos][results[i]["label"]] = results[i]["score"] if results[i]["label"] != "no_emotion" else 0.001
+    colors = {pos: emotion_to_color[max(score, key=score.get)] for pos, score in scores.items()}
+    map_table = {
+        "lon": [pos[1] for pos in cnt.keys()],  # geolocation tuples are (lat, lon)
+        "lat": [pos[0] for pos in cnt.keys()],
+        "color": colors,
+        "size": list(cnt.values())
+    }
+    m = folium.Map()
+    for pos in cnt.keys():
+        # print(pos)
+        folium.CircleMarker((float(pos[0]), float(pos[1])), radius=cnt[pos] / POSTS_CNT * 100, color=colors[pos]).add_to(m)
+    st_folium(m, width=725, returned_objects=[])
+    # st.map(map_table, latitude="lat", longitude="lon", color="color", size="size")
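The heart of the rendering block is the per-location aggregation: every location keeps one score slot per emotion, and the marker color is taken from the highest-scoring emotion. A self-contained toy run of that exact logic (the coordinates and classifier outputs below are made up for illustration):

from collections import defaultdict

emotions = ['no_emotion', 'joy', 'sadness', 'surprise', 'fear', 'anger']
emotion_to_color = {'no_emotion': "#666666", 'joy': "#33cc33", 'sadness': "#0066ff",
                    'surprise': "#ff9900", 'fear': "#aa2fd6", 'anger': "#ff0000"}

# Fake (geolocation, classifier result) pairs standing in for posts/results in app.py.
fake = [((55.75, 37.62), {"label": "joy", "score": 0.9}),
        ((55.75, 37.62), {"label": "anger", "score": 0.4}),
        ((59.94, 30.31), {"label": "sadness", "score": 0.7})]

scores = defaultdict(lambda: {e: 0.0 for e in emotions})
cnt = defaultdict(int)
for pos, result in fake:
    cnt[pos] += 1
    scores[pos][result["label"]] = result["score"] if result["label"] != "no_emotion" else 0.001

colors = {pos: emotion_to_color[max(score, key=score.get)] for pos, score in scores.items()}
print(colors)  # {(55.75, 37.62): '#33cc33', (59.94, 30.31): '#0066ff'}

Note that only the latest score per (location, label) is kept, so a single strong post can set a city's color; averaging per label would be the obvious alternative.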
src/post_assesment.py
ADDED
@@ -0,0 +1,33 @@
+# from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from transformers import pipeline
+from post_search import Post
+import streamlit as st
+
+@st.cache_resource
+def load_model():
+    model_name = "cointegrated/rubert-tiny2-cedr-emotion-detection"
+    # return AutoTokenizer.from_pretrained(model_name), AutoModelForSequenceClassification.from_pretrained(model_name)
+    return pipeline("text-classification", model_name)
+
+# tokenizer, model = load_model()
+pipe = load_model()
+emotions = ['no_emotion', 'joy', 'sadness', 'surprise', 'fear', 'anger']
+BATCH_SIZE = 64
+
+# def get_sentiment(posts: list[Post]):
+#     all_texts = [post.text for post in posts]
+#     result = []
+#     for i in range(0, len(all_texts), BATCH_SIZE):
+#         texts = all_texts[i:i + BATCH_SIZE]
+#         inputs = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors='pt')
+#         print("Got tokens", inputs, flush=True)
+#         output = model(**inputs)
+#         print("Got output", flush=True)
+#         probs = torch.softmax(output['logits'], dim=-1)
+#         print("Got probs", flush=True)
+#         result.extend([{emotion: probs[i, j].item() for j, emotion in enumerate(emotions)} for i in range(len(probs))])
+#     return result
+
+def get_sentiment(posts: list[Post]):
+    all_texts = [post.text for post in posts]
+    return pipe(all_texts, truncation=True, max_length=2048)
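get_sentiment returns the pipeline's default output, one {'label': ..., 'score': ...} dict per input text, which is the shape app.py indexes into. A minimal sketch of calling the model directly (downloads the model on first use; the printed result is illustrative, not a recorded output):

from transformers import pipeline

pipe = pipeline("text-classification", "cointegrated/rubert-tiny2-cedr-emotion-detection")
print(pipe(["Какой чудесный день!"], truncation=True, max_length=2048))
# e.g. [{'label': 'joy', 'score': 0.98}] -- label is one of the six emotions above

Passing top_k=None to the pipeline would return scores for all six labels per text, which is roughly what the commented-out manual tokenizer/model path computed.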
src/post_search.py
ADDED
@@ -0,0 +1,145 @@
+import vk_api
+from dotenv import load_dotenv
+import os
+from dataclasses import dataclass
+import numpy as np
+import streamlit as st
+import pandas as pd
+from joblib import Parallel, delayed
+
+load_dotenv()
+
+@st.cache_resource
+def connect_api():
+    service_token = os.getenv("VK_TOKEN")
+    return vk_api.VkApi(token=service_token).get_api()
+
+vk = connect_api()
+
+@st.cache_resource
+def get_cities_db():
+    return pd.read_csv("towns.csv")
+
+cities_db = get_cities_db()
+
+
+@dataclass
+class Post:
+    text: str
+    city_of_origin: str
+    geolocation: tuple[float, float]
+    # likes: int
+    owner_id: int
+    # group_owned: bool = False
+
+def search_posts(query: str, num_of_posts: int, *, search_args = {}) -> list[Post]:
+    posts: list[Post] = []
+    offset = 0
+    request_count = min(num_of_posts, 200)
+    city_none_stat = 0
+    pos_none_stat = 0
+    while len(posts) < num_of_posts:
+        query_results = vk.newsfeed.search(q=query, count=request_count, offset=offset, **search_args)
+        items = [item for item in query_results["items"] if "owner_id" in item and "text" in item]
+        item_dict = {item["owner_id"]: item for item in items}
+        # print(query_results, items, flush=True)
+        owner_ids = np.array([item["owner_id"] for item in items])
+        cities = get_post_city(owner_ids)
+        city_pos = get_city_position(cities)
+        # likes = item.get("likes", {"count": 0})["count"]
+        for id, pos in city_pos.items():
+            if cities[id] is None:
+                city_none_stat += 1
+                continue
+            if pos is None:
+                pos_none_stat += 1
+                continue
+            posts.append(Post(item_dict[id]["text"], cities[id], city_pos[id], id))
+        offset += request_count
+        print(f"Processed {offset} posts, added {len(posts)}. City not found: {city_none_stat}, position not found: {pos_none_stat}.", flush=True)
+    return posts[:num_of_posts]
+
+
+def search_posts_parallel(query: str, num_of_posts: int, num_of_workers: int = 4, *, search_args = {}):
+    posts: list[Post] = []
+    offset = 0
+    request_count = min(num_of_posts // num_of_workers + 1, 200)
+    while len(posts) < num_of_posts:
+        search_res = Parallel(n_jobs=num_of_workers) \
+            (
+                delayed(_get_posts)(query, request_count, offset + i * request_count, search_args) for i in range(num_of_workers)
+            )
+        for p in search_res:
+            posts.extend(p)
+            # print(*[pp.geolocation for pp in p])
+        offset += request_count * num_of_workers
+        print(f"Processed {offset} posts, added {len(posts)}.", flush=True)
+    return posts[:num_of_posts]
+
+
+def _get_posts(query: str, request_count: int, offset: int, search_args) -> list[Post]:
+    posts = []
+    query_results = vk.newsfeed.search(q=query, count=request_count, offset=offset, **search_args)
+    items = [item for item in query_results["items"] if "owner_id" in item and "text" in item]
+    item_dict = {item["owner_id"]: item for item in items}
+    # print(query_results, items, flush=True)
+    owner_ids = np.array([item["owner_id"] for item in items])
+    cities = get_post_city(owner_ids)
+    city_pos = get_city_position(cities)
+    # likes = item.get("likes", {"count": 0})["count"]
+    for id, pos in city_pos.items():
+        if pos is None:
+            continue
+        posts.append(Post(item_dict[id]["text"], cities[id], pos, id))
+    assert all(post.geolocation is not None for post in posts)
+    return posts
+
+
+def get_post_city(owner_id: np.ndarray) -> dict[int, str | None]:
+    group_ids = -owner_id[owner_id < 0]
+    user_ids = owner_id[owner_id > 0]
+    assert len(group_ids) + len(user_ids) == len(owner_id)
+    # print(group_ids, user_ids, owner_id, flush=True)
+    if len(group_ids) > 0:
+        groups = vk.groups.getById(group_ids=list(group_ids), fields=['city', 'country'])
+        groups_dict = {-group["id"]: group.get("city", None) for group in groups}
+    else:
+        groups_dict = {}
+    if len(user_ids) > 0:
+        users = vk.users.get(user_ids=list(user_ids), fields=['city', 'country'])
+        users_dict = {user["id"]: user.get("city", None) for user in users}
+    else:
+        users_dict = {}
+
+    users_dict.update(groups_dict)
+    return {id: city["title"] if city is not None else None for id, city in users_dict.items()}
+
+
+def get_city_position(cities: dict[int, str | None]) -> dict[int, tuple[float, float] | None]:
+    res = {}
+    for id, city in cities.items():
+        if city is None:
+            res[id] = None
+            continue
+        selected = cities_db[cities_db["city"] == city]
+        if len(selected) == 0:
+            res[id] = None
+            continue
+        # print(selected)
+        res[id] = (selected["lat"].iloc[0], selected["lon"].iloc[0])
+    assert len(cities) == len(res)
+    return res
+
+
+def search_posts_by_pos(query: str, num_of_posts: int, city_name: str, lat: float, lon: float) -> list[Post]:
+    posts: list[Post] = []
+    offset = 0
+    request_count = min(num_of_posts, 200)
+    while len(posts) < num_of_posts:
+        query_results = vk.newsfeed.search(q=query, count=request_count, offset=offset, latitude=lat, longitude=lon)
+        items = [item for item in query_results["items"] if "text" in item]
+        for item in items:
+            posts.append(Post(item["text"], city_name, (lat, lon), item.get("owner_id", 0)))
+        offset += request_count
+        print(f"For city {city_name} processed {offset} posts, added {len(posts)}.", flush=True)
+    return posts[:num_of_posts]
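All entry points assume a VK service token in the environment (loaded via load_dotenv) and a towns.csv with city, lat, and lon columns for geocoding. A hypothetical smoke test run from inside src/ (the token and timestamp are placeholders; search_args is forwarded verbatim to vk.newsfeed.search, so VK parameters such as start_time can be passed through):

# .env (illustrative):
# VK_TOKEN=your_service_token_here

from post_search import search_posts

# Restrict the search to posts newer than a given Unix timestamp via search_args.
posts = search_posts("котики", 20, search_args={"start_time": 1735689600})
for post in posts[:3]:
    print(post.city_of_origin, post.geolocation, post.text[:40])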
towns.csv
ADDED
The diff for this file is too large to render.