SerZak05 committed on
Commit
538569e
·
1 Parent(s): c75ee0b

Initial version

Browse files
requirements.txt CHANGED
@@ -1,3 +1,9 @@
1
- altair
2
  pandas
3
- streamlit
 
 
 
 
 
 
 
 
 
1
  pandas
2
+ streamlit
3
+ transformers
4
+ vk_api
5
+ python-dotenv
6
+ numpy
7
+ joblib
8
+ folium
9
+ streamlit_folium
src/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .post_assesment import get_sentiment
2
+ from .post_search import search_posts
src/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (257 Bytes). View file
 
src/__pycache__/post_assesment.cpython-313.pyc ADDED
Binary file (1.05 kB). View file
 
src/__pycache__/post_search.cpython-313.pyc ADDED
Binary file (8.95 kB). View file
 
src/app.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import streamlit as st
import folium
from streamlit_folium import st_folium
from collections import defaultdict
# from joblib import Parallel, delayed
from post_assesment import get_sentiment, emotions
from post_search import search_posts_parallel

# Marker color for each emotion label emitted by the classifier.
emotion_to_color = {
    'no_emotion': "#666666",
    'joy': "#33cc33",
    'sadness': "#0066ff",
    'surprise': "#ff9900",
    'fear': "#aa2fd6",
    'anger': "#ff0000"
}

POSTS_CNT = 500        # total number of VK posts to fetch per query
# TOP_CITIES_CNT = 2
NUM_OF_WORKERS = 8     # worker count for the commented-out per-city search path
# top_cities = cities_db.nlargest(n=TOP_CITIES_CNT, columns="population")

# === Beginning of the page ===
st.title("Sentiment analysis")
topic = st.text_input("Enter your topic:", "котики")
button = st.button("Start!")

if button:
    # New run requested: clear cached results from any previous query.
    st.session_state["running"] = True
    st.session_state.pop("results", None)
    st.session_state.pop("posts", None)

if st.session_state.get("running", False):
    st.text("Processing query...")
    # Fetch posts (with resolved author locations), then classify them.
    st.session_state["posts"] = search_posts_parallel(topic, POSTS_CNT)
    # posts_per_city = Parallel(n_jobs=NUM_OF_WORKERS) \
    # (
    #     delayed(search_posts_by_pos)(topic, POSTS_CNT, city_row["city"], city_row["lat"], city_row["lon"])
    #     for ind, city_row in top_cities.iterrows()
    # )
    # posts = [post for city_list in posts_per_city for post in city_list]
    # print(*[post.owner_id for post in posts], sep='\n', flush=True)
    # st.session_state["posts"] = posts
    st.text("Gathered posts...")
    st.session_state["results"] = get_sentiment(st.session_state["posts"])
    # st.write(st.session_state["results"])
    st.session_state["running"] = False

if "results" in st.session_state:
    print("Got results!", flush=True)
    posts = st.session_state["posts"]
    results = st.session_state["results"]
    # Per-location accumulators: emotion scores, post counts, city names.
    scores = defaultdict(lambda: {e: 0.0 for e in emotions})
    cnt = defaultdict(int)
    names = {}
    for i in range(len(posts)):
        pos = posts[i].geolocation
        names[pos] = posts[i].city_of_origin
        cnt[pos] += 1
        # for label, score in results[i].items():
        # scores[pos][label] = score
        # NOTE(review): this is an assignment, not accumulation — for each
        # location only the LAST post with a given label sets the score.
        # Confirm summing/averaging wasn't intended.
        scores[pos][results[i]["label"]] = results[i]["score"] if results[i]["label"] != "no_emotion" else 0.001
    # Color each location by its highest-scoring emotion.
    colors = {pos: emotion_to_color[max(score, key=score.get)] for pos, score in scores.items()}
    # NOTE(review): map_table is only consumed by the commented-out st.map
    # call below; its "lon"/"lat" keys appear swapped relative to the
    # (lat, lon) order used by CircleMarker — verify before re-enabling.
    map_table = {
        "lon": [pos[0] for pos in cnt.keys()],
        "lat": [pos[1] for pos in cnt.keys()],
        "color": colors,
        "size": cnt.values()
    }
    m = folium.Map()
    for pos in cnt.keys():
        # print(pos)
        # Marker radius scales with the share of all posts from this location.
        folium.CircleMarker((float(pos[0]), float(pos[1])), radius=cnt[pos] / POSTS_CNT * 100, color=colors[pos]).add_to(m)
    st_folium(m, width=725, returned_objects=[])
    # st.map(map_table, latitude="lat", longitude="lon", color="color", size="size")
src/post_assesment.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from transformers import AutoTokenizer, AutoModelForSequenceClassification
2
+ from transformers import pipeline
3
+ from post_search import Post
4
+ import streamlit as st
5
+
6
@st.cache_resource
def load_model():
    """Build the emotion-classification pipeline.

    Cached with ``st.cache_resource`` so the model weights are loaded only
    once per Streamlit process.
    """
    return pipeline(
        "text-classification",
        "cointegrated/rubert-tiny2-cedr-emotion-detection",
    )
11
+
# tokenizer, model = load_model()
pipe = load_model()  # HF text-classification pipeline, created once at import
# Labels produced by the cedr-emotion-detection model, in model output order.
emotions = ['no_emotion', 'joy', 'sadness', 'surprise', 'fear', 'anger']
BATCH_SIZE = 64  # batch size for the (commented-out) manual inference path

# def get_sentiment(posts: list[Post]):
# all_texts = [post.text for post in posts]
# result = []
# for i in range(0, len(all_texts), BATCH_SIZE):
# texts = all_texts[i*BATCH_SIZE:(i+1)*BATCH_SIZE]
# inputs = tokenizer(texts, padding=True, truncation=True, max_len=512, return_tensors='pt')
# print("Got tokens", inputs, flush=True)
# output = model(**inputs)
# print("Got output", flush=True)
# probs = torch.softmax(output['logits'], dim=-1)
# print("Got probs", flush=True)
# result.extend([{emotion: probs[i, j].item() for j, emotion in enumerate(emotions)} for i in range(len(probs))])
# return result
30
+
def get_sentiment(posts: list[Post]) -> list[dict]:
    """Classify the dominant emotion of each post's text.

    Args:
        posts: posts whose ``text`` attribute is fed to the classifier.

    Returns:
        One ``{"label": ..., "score": ...}`` dict per post, in input order;
        an empty list for empty input.
    """
    if not posts:
        return []
    all_texts = [post.text for post in posts]
    # Fixed kwarg: the HF pipeline/tokenizer accepts ``max_length``, not
    # ``max_len`` — the original truncation limit was never applied.
    return pipe(all_texts, truncation=True, max_length=2048)
src/post_search.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import vk_api
2
+ from dotenv import load_dotenv
3
+ import os
4
+ from dataclasses import dataclass
5
+ import numpy as np
6
+ import streamlit as st
7
+ import pandas as pd
8
+ from joblib import Parallel, delayed
9
+
load_dotenv()  # pull VK_TOKEN (and friends) from a local .env file

@st.cache_resource
def connect_api():
    """Create (and cache) an authorized VK API client from VK_TOKEN."""
    service_token = os.getenv("VK_TOKEN")
    return vk_api.VkApi(token=service_token).get_api()

vk = connect_api()

@st.cache_resource
def get_cities_db():
    """Load (and cache) the city -> coordinates lookup table.

    Expected columns include "city", "lat", "lon" — see get_city_position.
    """
    return pd.read_csv("towns.csv")

cities_db = get_cities_db()
24
+
25
+
@dataclass
class Post:
    """A VK post together with its author's resolved location."""
    text: str  # post body text
    city_of_origin: str  # city name resolved from the author's profile
    geolocation: tuple[float, float]  # (lat, lon), as produced by get_city_position
    # likes: int
    owner_id: int  # VK owner id (negative values denote groups)
    # group_owned: bool = False
34
+
def search_posts(query: str, num_of_posts: int, *, search_args=None) -> list[Post]:
    """Search VK newsfeed posts and attach each author's city and coordinates.

    Posts whose author's city (or that city's coordinates) cannot be
    resolved are skipped.

    Args:
        query: free-text query for ``vk.newsfeed.search``.
        num_of_posts: maximum number of posts to return.
        search_args: extra keyword arguments forwarded to ``vk.newsfeed.search``.

    Returns:
        Up to ``num_of_posts`` Post objects with a resolved geolocation.
    """
    # Avoid the shared-mutable-default pitfall of ``search_args={}``.
    search_args = {} if search_args is None else search_args
    posts: list[Post] = []
    offset = 0
    request_count = min(num_of_posts, 200)  # VK caps `count` at 200 per call
    city_none_stat = 0
    pos_none_stat = 0
    while len(posts) < num_of_posts:
        query_results = vk.newsfeed.search(q=query, count=request_count, offset=offset, **search_args)
        items = [item for item in query_results["items"] if "owner_id" in item and "text" in item]
        if not items:
            # Feed exhausted — stop instead of polling the API forever.
            break
        owner_ids = np.array([item["owner_id"] for item in items])
        cities = get_post_city(owner_ids)
        city_pos = get_city_position(cities)
        # Iterate the items themselves: several posts may share one owner_id,
        # and the original owner-keyed dict silently dropped all but one.
        for item in items:
            oid = item["owner_id"]
            if cities[oid] is None:
                city_none_stat += 1
                continue
            if city_pos[oid] is None:
                pos_none_stat += 1
                continue
            posts.append(Post(item["text"], cities[oid], city_pos[oid], oid))
        offset += request_count
        print(f"Processed {offset} posts, added {len(posts)}. City not found: {city_none_stat}, position not found: {pos_none_stat}.", flush=True)
    return posts[:num_of_posts]
61
+
62
+
def search_posts_parallel(query: str, num_of_posts: int, num_of_workers: int = 4, *, search_args=None):
    """Fetch geolocated posts using several workers on disjoint offsets.

    Args:
        query: free-text query for ``vk.newsfeed.search``.
        num_of_posts: maximum number of posts to return.
        num_of_workers: number of parallel joblib jobs per round.
        search_args: extra keyword arguments forwarded to ``vk.newsfeed.search``.

    Returns:
        Up to ``num_of_posts`` Post objects with resolved geolocation.
    """
    # Avoid the shared-mutable-default pitfall of ``search_args={}``.
    search_args = {} if search_args is None else search_args
    posts: list[Post] = []
    offset = 0
    request_count = min(num_of_posts // num_of_workers + 1, 200)
    while len(posts) < num_of_posts:
        search_res = Parallel(n_jobs=num_of_workers) \
            (
                delayed(_get_posts)(query, request_count, offset + i * request_count, search_args)
                for i in range(num_of_workers)
            )
        added = 0
        for chunk in search_res:
            posts.extend(chunk)
            added += len(chunk)
        offset += request_count * num_of_workers
        print(f"Processed {offset} posts, added {len(posts)}.", flush=True)
        if added == 0:
            # A whole round produced nothing usable — assume the feed is
            # exhausted rather than spinning on the API forever.
            break
    return posts[:num_of_posts]
78
+
79
+
def _get_posts(query: str, request_count: int, offset: int, search_args) -> list[Post]:
    """Fetch one page of search results and resolve author locations.

    Worker body for search_posts_parallel; returns only posts whose
    author's city could be mapped to coordinates.
    """
    query_results = vk.newsfeed.search(q=query, count=request_count, offset=offset, **search_args)
    items = [item for item in query_results["items"] if "owner_id" in item and "text" in item]
    item_count = len(items)
    owner_ids = np.array([item["owner_id"] for item in items])
    cities = get_post_city(owner_ids)
    city_pos = get_city_position(cities)
    posts: list[Post] = []
    # Iterate the raw items: multiple posts can share one owner_id, and the
    # original owner-keyed dict kept at most one post per owner.
    for item in items:
        oid = item["owner_id"]
        pos = city_pos[oid]
        if pos is None:
            # Covers both "no city on profile" and "city not in towns.csv".
            continue
        posts.append(Post(item["text"], cities[oid], pos, oid))
    assert all(post.geolocation is not None for post in posts)
    return posts
96
+
97
+
def get_post_city(owner_id: np.ndarray) -> dict[int, str | None]:
    """Resolve the profile city for each VK owner id.

    Negative ids are groups, positive ids are users; the two sets are
    resolved through separate VK endpoints and merged.

    Args:
        owner_id: array of signed VK owner ids (zeros are not expected and
            would trip the assert below).

    Returns:
        Mapping from the original (signed) owner id to the city title, or
        None when the profile exposes no city.
    """
    group_ids = -owner_id[owner_id < 0]  # VK group ids are positive in the API
    user_ids = owner_id[owner_id > 0]
    # Any id equal to 0 falls into neither bucket and fails here.
    assert len(group_ids) + len(user_ids) == len(owner_id)
    # print(group_ids, user_ids, owner_id, flush=True)
    if len(group_ids) > 0:
        groups = vk.groups.getById(group_ids=list(group_ids), fields=['city', 'country'])
        # Key by the original negative owner id so the merge below is clean.
        groups_dict = {-group["id"]: group.get("city", None) for group in groups}
    else:
        groups_dict = {}
    if len(user_ids) > 0:
        users = vk.users.get(user_ids=list(user_ids), fields=['city', 'country'])
        users_dict = {user["id"]: user.get("city", None) for user in users}
    else:
        users_dict = {}

    users_dict.update(groups_dict)
    # VK returns the city as an object; keep only its human-readable title.
    return {id: city["title"] if city is not None else None for id, city in users_dict.items()}
116
+
117
+
def get_city_position(cities: dict[int, str | None]) -> dict[int, tuple[float, float] | None]:
    """Look up (lat, lon) coordinates for each resolved city name.

    Entries whose city is None, or which are absent from the towns table,
    map to None.
    """
    def locate(city_name):
        # Coordinates of the first matching row in the towns table, if any.
        if city_name is None:
            return None
        match = cities_db[cities_db["city"] == city_name]
        if len(match) == 0:
            return None
        return (match["lat"].iloc[0], match["lon"].iloc[0])

    positions = {owner: locate(city) for owner, city in cities.items()}
    assert len(cities) == len(positions)
    return positions
132
+
133
+
def search_posts_by_pos(query: str, num_of_posts: int, city_name: str, lat: float, lon: float) -> list[Post]:
    """Search posts geotagged near a given coordinate.

    Every returned post is attributed to ``city_name`` at ``(lat, lon)``.

    Args:
        query: free-text query for ``vk.newsfeed.search``.
        num_of_posts: maximum number of posts to return.
        city_name: label to attach to each returned post.
        lat: latitude of the search center.
        lon: longitude of the search center.

    Returns:
        Up to ``num_of_posts`` Post objects.
    """
    posts: list[Post] = []
    offset = 0
    request_count = min(num_of_posts, 200)  # VK caps `count` at 200 per call
    while len(posts) < num_of_posts:
        # Fixed parameter name: the VK API expects ``longitude``; the original
        # ``longtitude`` typo meant the coordinate was never applied.
        query_results = vk.newsfeed.search(q=query, count=request_count, offset=offset, latitude=lat, longitude=lon)
        items = [item for item in query_results["items"] if "text" in item]
        if not items:
            # Feed exhausted — stop instead of polling the API forever.
            break
        for item in items:
            posts.append(Post(item["text"], city_name, (lat, lon), item.get("owner_id", 0)))
        offset += request_count
        print(f"For city {city_name} processed {offset} posts, added {len(posts)}.", flush=True)
    return posts[:num_of_posts]
towns.csv ADDED
The diff for this file is too large to render. See raw diff