| import csv |
| import sys |
| from collections import defaultdict |
| from statistics import mean, variance |
|
|
|
|
| |
| |
| |
def load_csv(path):
    """Read the ``keyword`` and ``stars`` columns from a CSV file.

    The file is decoded as UTF-8 (undecodable bytes are replaced) and parsed
    with ``csv.DictReader``, so the first row is used as the header.

    Args:
        path: Path of the CSV file to read.

    Returns:
        A ``(keywords, stars)`` tuple of two independent lists.
        ``keywords`` holds the raw ``keyword`` cell of every row that has
        that column (``None`` for rows too short to reach it).
        ``stars`` holds only the ``stars`` cells that parse as ``int``;
        missing, empty or non-integer cells are skipped, so the two lists
        may differ in length.
    """
    keywords = []
    stars = []

    # newline="" hands line splitting to the csv module, so line breaks
    # embedded in quoted fields reach the parser unmodified.
    with open(path, "r", encoding="utf-8", errors="replace", newline="") as f:
        for row in csv.DictReader(f):
            if "keyword" in row:
                keywords.append(row["keyword"])

            if "stars" in row:
                try:
                    stars.append(int(row["stars"]))
                except (TypeError, ValueError):
                    # TypeError: short row, the cell was filled with None.
                    # ValueError: empty or non-integer text.
                    # Best-effort parsing: skip the cell, keep reading.
                    pass

    return keywords, stars
|
|
|
|
| |
| |
| |
def make_bins(start, end, step):
    """Build histogram bin edges and their display labels.

    Args:
        start: First bin edge.
        end: Value the closed bins must reach before the open-ended
            overflow bucket begins.
        step: Width of every closed bin; must be a positive integer.

    Returns:
        A ``(bins, labels)`` tuple. ``bins`` is
        ``list(range(start, end + step, step))``. ``labels`` has one
        ``"lo-hi"`` entry per adjacent pair of edges followed by a single
        ``"N+"`` overflow label, where ``N`` is the last edge in ``bins``
        (or ``end`` when ``bins`` is empty).

    Raises:
        ValueError: If ``step`` is not positive.
    """
    if step <= 0:
        raise ValueError(f"step must be positive, got {step}")

    bins = list(range(start, end + step, step))
    labels = [f"{lo}-{hi}" for lo, hi in zip(bins, bins[1:])]

    # When (end - start) is not a multiple of step the last closed bin runs
    # past `end`; label the overflow bucket from the real last edge so it
    # never overlaps the preceding "lo-hi" label.
    overflow_from = bins[-1] if bins else end
    labels.append(f"{overflow_from}+")
    return bins, labels
|
|
|
|
def distribute(values, bins, labels):
    """Count how many values fall under each label.

    Args:
        values: Iterable of numbers to bucket.
        bins: Sequence of bin edges; interval ``k`` is the half-open range
            ``[bins[k], bins[k + 1])``.
        labels: Sequence of label strings; ``labels[k]`` names interval
            ``k`` and the last label collects everything else.

    Returns:
        A dict mapping every label to its count. A value that matches no
        interval is counted under ``labels[-1]``; this includes values
        below ``bins[0]`` as well as values at or above the last edge.
    """
    counts = dict.fromkeys(labels, 0)
    n_intervals = len(bins) - 1

    for value in values:
        # Index of the first interval containing the value, or -1 so that
        # the lookup below lands on the final (catch-all) label.
        slot = next(
            (k for k in range(n_intervals) if bins[k] <= value < bins[k + 1]),
            -1,
        )
        counts[labels[slot]] += 1

    return counts
|
|
|
|
| |
| |
| |
def analyze_csv(path, start, end, step):
    """Print keyword frequencies and ``stars`` statistics for a CSV file.

    The report is written to stdout in three parts: how often each
    ``keyword`` value occurs, summary statistics of the ``stars`` column,
    and a histogram of ``stars`` over the bins described by
    ``start``/``end``/``step``.

    Args:
        path: Path of the CSV file, passed to ``load_csv``.
        start: First histogram bin edge, passed to ``make_bins``.
        end: Upper bound of the closed bins, passed to ``make_bins``.
        step: Bin width, passed to ``make_bins``.

    Returns:
        None.
    """
    print(f"\n========== 分析 CSV 文件:{path} ==========")

    keywords, stars = load_csv(path)

    print("\n========== keyword 分布 ==========")
    keyword_count = defaultdict(int)
    for kw in keywords:
        keyword_count[kw] += 1

    total_keywords = len(keywords)
    # sorted() is stable, so keywords with equal counts keep first-seen order.
    # The loop body never runs when there are no keywords, so the division
    # below cannot see a zero total.
    for kw, cnt in sorted(keyword_count.items(), key=lambda item: -item[1]):
        pct = cnt / total_keywords * 100
        print(f"{kw}: {cnt} ({pct:.2f}%)")

    print("\n========== stars 统计 ==========")
    if not stars:
        print("没有 stars 字段或无有效数据")
        # Nothing to summarise or bucket; stopping here also keeps the
        # percentage computation below from dividing by zero.
        return

    total_stars = len(stars)
    print(f"个数: {total_stars}")
    print(f"最小值: {min(stars)}")
    print(f"最大值: {max(stars)}")
    print(f"均值: {mean(stars):.2f}")
    # statistics.variance needs at least two data points.
    if total_stars >= 2:
        print(f"方差: {variance(stars):.2f}")

    bins, labels = make_bins(start, end, step)
    dist = distribute(stars, bins, labels)

    print("区间分布")
    for lab in labels:
        cnt = dist[lab]
        pct = cnt / total_stars * 100
        print(f"{lab}: {cnt} ({pct:.2f}%)")
|
|
|
|
if __name__ == "__main__":
    # Usage: script.py [path [start [end [step]]]]
    # Every argument is optional; the defaults reproduce the original
    # hard-coded run.
    _cli_args = sys.argv[1:]
    path = (
        _cli_args[0]
        if len(_cli_args) > 0
        else "/home/weifengsun/tangou1/domain_code/src/workdir/repos_checked.csv"
    )
    start = int(_cli_args[1]) if len(_cli_args) > 1 else 0
    end = int(_cli_args[2]) if len(_cli_args) > 2 else 200
    step = int(_cli_args[3]) if len(_cli_args) > 3 else 20
    analyze_csv(path, start, end, step)
|
|