# backend/simulation/utils/profile_generator.py
# vish85521's picture
# Upload 182 files
# 66f749a verified
import random
import logging
from typing import List, Dict, Any, Optional
logger = logging.getLogger(__name__)
class ProfileGenerator:
    """Generate synthetic agent profiles and a friendship graph between them.

    All demographic tables (age brackets, city weights, occupations, name
    pools, ...) are hard-coded class attributes. Every random draw goes
    through the module-level ``random`` generator, so seeding ``random``
    before calling makes the output reproducible.

    Supported ``demographic_filter`` keys (all optional; empty/falsy entries
    are ignored):

    * ``age_range``: two-item sequence of inclusive bounds, in either order.
    * ``gender`` / ``location``: a single value; ``'All'`` means "no filter".
    * ``education``, ``income_level``, ``religion``, ``ethnicity``,
      ``social_media_usage``, ``political_leaning``: a collection to pick one
      entry from, or a single string used as-is.
    * ``values``: values every profile must carry (collection or one string);
      up to two further values are added at random.
    """

    # Relative weight of each inclusive (min_age, max_age) bracket.
    AGE_DISTRIBUTION = {
        (18, 24): 0.15,
        (25, 34): 0.25,
        (35, 44): 0.22,
        (45, 54): 0.18,
        (55, 64): 0.12,
        (65, 80): 0.08,
    }

    # Relative weight of each city. The weights do not have to sum to 1:
    # random.choices() treats them as relative.
    LOCATION_DISTRIBUTION = {
        "Colombo": 0.14,
        "Dehiwala-Mount Lavinia": 0.06,
        "Moratuwa": 0.05,
        "Negombo": 0.04,
        "Sri Jayawardenepura Kotte": 0.04,
        "Kandy": 0.06,
        "Galle": 0.04,
        "Jaffna": 0.03,
        "Trincomalee": 0.03,
        "Batticaloa": 0.03,
        "Anuradhapura": 0.04,
        "Polonnaruwa": 0.02,
        "Kurunegala": 0.04,
        "Ratnapura": 0.03,
        "Badulla": 0.03,
        "Matara": 0.03,
        "Hambantota": 0.02,
        "Vavuniya": 0.02,
        "Nuwara Eliya": 0.03,
        "Kalmunai": 0.02,
        "Ampara": 0.02,
        "Kalutara": 0.03,
        "Gampaha": 0.06,
        "Puttalam": 0.02,
        "Mannar": 0.01,
    }

    # [latitude, longitude] anchor of each city; jitter is added per profile.
    BASE_COORDINATES = {
        "Colombo": [6.9271, 79.8612],
        "Dehiwala-Mount Lavinia": [6.8649, 79.8653],
        "Moratuwa": [6.7731, 79.8816],
        "Negombo": [7.2097, 79.8356],
        "Sri Jayawardenepura Kotte": [6.8924, 79.9022],
        "Kandy": [7.2906, 80.6337],
        "Galle": [6.0328, 80.2150],
        "Jaffna": [9.6615, 80.0255],
        "Trincomalee": [8.5874, 81.2152],
        "Batticaloa": [7.7170, 81.6924],
        "Anuradhapura": [8.3114, 80.4037],
        "Polonnaruwa": [7.9403, 81.0188],
        "Kurunegala": [7.4863, 80.3647],
        "Ratnapura": [6.6828, 80.3992],
        "Badulla": [6.9934, 81.0550],
        "Matara": [5.9496, 80.5353],
        "Hambantota": [6.1241, 81.1185],
        "Vavuniya": [8.7514, 80.4971],
        "Nuwara Eliya": [6.9497, 80.7891],
        "Kalmunai": [7.4148, 81.8262],
        "Ampara": [7.2975, 81.6724],
        "Kalutara": [6.5854, 79.9607],
        "Gampaha": [7.0873, 79.9995],
        "Puttalam": [8.0362, 79.8283],
        "Mannar": [8.9810, 79.9044],
    }

    VALUES = [
        "family_oriented", "traditional", "modern", "environmentally_conscious",
        "religious", "career_focused", "community_oriented", "individualistic",
        "health_conscious", "tech_savvy", "budget_conscious", "luxury_oriented",
        "socially_aware", "politically_active",
    ]
    PERSONALITY_TRAITS = [
        "Analytical", "Empathetic", "Traditional", "Ambitious",
        "Skeptical", "Optimistic", "Cautious", "Social",
        "Independent", "Loyal", "Creative", "Pragmatic",
    ]
    INCOME_LEVELS = [
        "Below Poverty Line", "Lower Income", "Lower Middle Income",
        "Middle Income", "Upper Middle Income", "Upper Income",
    ]
    SOCIAL_MEDIA_USAGE = ["Very High", "High", "Moderate", "Low", "None"]
    POLITICAL_LEANING = ["Progressive", "Moderate", "Conservative", "Nationalist", "Apolitical"]
    OCCUPATIONS_YOUNG = ["Student", "Junior Developer", "Marketing Associate", "Content Creator", "Freelancer"]
    OCCUPATIONS_MID = ["Teacher", "Engineer", "Doctor", "Manager", "Business Owner", "Accountant", "Lawyer"]
    OCCUPATIONS_SENIOR = ["Senior Manager", "Consultant", "Professor", "Retired", "Business Owner"]
    EDUCATION_LEVELS = [
        "High School", "Bachelor's", "Master's", "PhD",
        "Professional Certification", "No Formal Education",
    ]

    # Anchor used when a (filtered) location has no BASE_COORDINATES entry.
    _DEFAULT_COORDINATES = [7.8731, 80.7718]
    # Maximum coordinate offset, in degrees, applied in each direction.
    _COORDINATE_JITTER = 0.05
    # Coastal cities: jitter is clamped on the seaward side so points stay on land.
    _WEST_COAST_CITIES = frozenset({
        "Colombo", "Dehiwala-Mount Lavinia", "Moratuwa", "Negombo",
        "Kalutara", "Puttalam", "Mannar",
    })
    _EAST_COAST_CITIES = frozenset({"Trincomalee", "Batticaloa", "Kalmunai"})
    _SOUTH_COAST_CITIES = frozenset({"Galle", "Matara", "Hambantota"})
    _NORTH_COAST_CITIES = frozenset({"Jaffna"})

    # Locations that use a non-default religion/ethnicity mix.
    _TAMIL_MAJORITY_LOCATIONS = frozenset({"Jaffna", "Vavuniya", "Mannar", "Batticaloa", "Trincomalee"})
    _MOOR_MAJORITY_LOCATIONS = frozenset({"Kalmunai", "Ampara"})

    # ethnicity -> (first names, surnames); any other ethnicity uses the fallback pool.
    _NAME_POOLS = {
        "Sinhalese": (
            ["Nuwan", "Chamara", "Dilanka", "Sachini", "Nimasha", "Kasun", "Tharaka", "Malsha", "Dinuka", "Sandali"],
            ["Perera", "Silva", "Fernando", "Jayasinghe", "Wickramasinghe", "Gunasekara", "Rajapaksa", "Dissanayake", "Bandara"],
        ),
        "Tamil": (
            ["Arjun", "Priya", "Kavitha", "Suresh", "Anitha", "Rajan", "Meena", "Vijay", "Lakshmi", "Krishnan"],
            ["Nair", "Pillai", "Shankar", "Murugan", "Selvam", "Balasingham", "Ratnasingham", "Thambipillai"],
        ),
        "Moor": (
            ["Mohamed", "Fathima", "Hassan", "Ayesha", "Ibrahim", "Zainab", "Rashid", "Nusrath", "Farhan", "Shifana"],
            ["Marikar", "Lafir", "Cader", "Zarook", "Ismail", "Saheed"],
        ),
    }
    # Burgher / other
    _FALLBACK_NAME_POOL = (
        ["Jerome", "Michelle", "Kevin", "Sandra", "Brian", "Karen"],
        ["de Silva", "van Dort", "Ondaatje", "Grenier"],
    )

    @classmethod
    def generate_profiles(
        cls,
        n: int = 1000,
        demographic_filter: Optional[Dict[str, Any]] = None
    ) -> List[Dict[str, Any]]:
        """Generate ``n`` profiles with ids ``agent_0000`` .. ``agent_<n-1>``.

        Args:
            n: Number of profiles to create; values <= 0 yield an empty list.
            demographic_filter: Optional constraints, see the class docstring.

        Returns:
            A list of profile dicts in id order.
        """
        return [
            cls._generate_single_profile(index, demographic_filter)
            for index in range(n)
        ]

    @staticmethod
    def _filter_value(demographic_filter: Optional[Dict[str, Any]], key: str) -> Any:
        """Return the filter entry for ``key``, or None when absent or falsy."""
        if not demographic_filter:
            return None
        return demographic_filter.get(key) or None

    @staticmethod
    def _choose(options: Any) -> Any:
        """Pick one option uniformly; a bare string counts as a single option."""
        if isinstance(options, str):
            # random.choice() on a str would return one *character* of it.
            return options
        return random.choice(list(options))

    @classmethod
    def _pick_age(cls, demographic_filter: Optional[Dict[str, Any]]) -> int:
        """Draw an age from the filter's range or from AGE_DISTRIBUTION."""
        age_range = cls._filter_value(demographic_filter, 'age_range')
        if age_range:
            # Accept the bounds in either order; randint() rejects low > high.
            low, high = sorted((age_range[0], age_range[1]))
            return random.randint(low, high)
        brackets = list(cls.AGE_DISTRIBUTION)
        weights = list(cls.AGE_DISTRIBUTION.values())
        bracket = random.choices(brackets, weights=weights)[0]
        return random.randint(*bracket)

    @classmethod
    def _pick_values(cls, demographic_filter: Optional[Dict[str, Any]]) -> List[str]:
        """Return the required values plus up to two extras, or 2-4 random values."""
        required = cls._filter_value(demographic_filter, 'values')
        if not required:
            return random.sample(cls.VALUES, random.randint(2, 4))
        # Copy into a fresh list so tuples/strings work and the caller's
        # filter object is never shared with the generated profile.
        required = [required] if isinstance(required, str) else list(required)
        others = [value for value in cls.VALUES if value not in required]
        return required + random.sample(others, min(2, len(others)))

    @classmethod
    def _jitter_coordinates(cls, location: str) -> List[float]:
        """Return ``[lat, lng]`` near the location's anchor, kept off the sea side."""
        base_lat, base_lng = cls.BASE_COORDINATES.get(location, cls._DEFAULT_COORDINATES)
        lat_min = lng_min = -cls._COORDINATE_JITTER
        lat_max = lng_max = cls._COORDINATE_JITTER
        if location in cls._WEST_COAST_CITIES:
            lng_min = 0.0  # only shift east
        elif location in cls._EAST_COAST_CITIES:
            lng_max = 0.0  # only shift west
        if location in cls._SOUTH_COAST_CITIES:
            lat_min = 0.0  # only shift north
        elif location in cls._NORTH_COAST_CITIES:
            lat_max = 0.0  # only shift south
        lat = base_lat + random.uniform(lat_min, lat_max)
        lng = base_lng + random.uniform(lng_min, lng_max)
        return [lat, lng]

    @classmethod
    def _pick_income_level(
        cls,
        demographic_filter: Optional[Dict[str, Any]],
        occupation: str,
        age: int
    ) -> str:
        """Draw an income level, weighted by occupation and age unless filtered."""
        allowed = cls._filter_value(demographic_filter, 'income_level')
        if allowed:
            return cls._choose(allowed)
        if occupation in ("Student", "Retired") or age < 22:
            weights = [0.1, 0.4, 0.3, 0.15, 0.05, 0.0]
        elif occupation in ("Business Owner", "Senior Manager", "Doctor", "Lawyer"):
            weights = [0.0, 0.0, 0.1, 0.3, 0.4, 0.2]
        else:
            weights = [0.05, 0.2, 0.3, 0.3, 0.1, 0.05]
        return random.choices(cls.INCOME_LEVELS, weights=weights)[0]

    @classmethod
    def _regional_demographics(cls, location: str) -> tuple:
        """Return (religions, weights, ethnicities, weights) for the location."""
        if location in cls._TAMIL_MAJORITY_LOCATIONS:
            return (
                ["Hindu", "Christian", "Muslim", "Buddhist"],
                [0.65, 0.15, 0.15, 0.05],
                ["Tamil", "Moor", "Sinhalese", "Burgher"],
                [0.80, 0.10, 0.05, 0.05],
            )
        if location in cls._MOOR_MAJORITY_LOCATIONS:
            return (
                ["Muslim", "Buddhist", "Hindu", "Christian"],
                [0.55, 0.30, 0.10, 0.05],
                ["Moor", "Sinhalese", "Tamil", "Burgher"],
                [0.55, 0.30, 0.10, 0.05],
            )
        return (
            ["Buddhist", "Hindu", "Muslim", "Christian"],
            [0.70, 0.13, 0.10, 0.07],
            ["Sinhalese", "Tamil", "Moor", "Burgher"],
            [0.74, 0.15, 0.09, 0.02],
        )

    @classmethod
    def _pick_social_media_usage(
        cls,
        demographic_filter: Optional[Dict[str, Any]],
        age: int
    ) -> str:
        """Draw a usage level; younger ages are weighted towards heavier use."""
        allowed = cls._filter_value(demographic_filter, 'social_media_usage')
        if allowed:
            return cls._choose(allowed)
        if age < 30:
            weights = [0.4, 0.4, 0.15, 0.05, 0.0]
        elif age < 50:
            weights = [0.1, 0.3, 0.4, 0.15, 0.05]
        else:
            weights = [0.0, 0.1, 0.3, 0.4, 0.2]
        return random.choices(cls.SOCIAL_MEDIA_USAGE, weights=weights)[0]

    @classmethod
    def _generate_single_profile(
        cls,
        index: int,
        demographic_filter: Optional[Dict[str, Any]] = None
    ) -> Dict[str, Any]:
        """Build one profile dict.

        Args:
            index: Sequence number, rendered zero-padded into ``agent_id``.
            demographic_filter: Optional constraints, see the class docstring.

        Returns:
            A dict with the keys ``agent_id``, ``name``, ``age``, ``gender``,
            ``location``, ``coordinates``, ``occupation``, ``education``,
            ``income_level``, ``religion``, ``ethnicity``,
            ``social_media_usage``, ``political_leaning``,
            ``personality_traits``, ``values`` and an empty ``bio``.
        """
        # NOTE: the order of the draws below is deliberate; keeping it stable
        # keeps seeded runs reproducible.
        age = cls._pick_age(demographic_filter)

        gender = cls._filter_value(demographic_filter, 'gender')
        if gender is None or gender == 'All':
            gender = random.choice(["Male", "Female"])

        location = cls._filter_value(demographic_filter, 'location')
        if location is None or location == 'All':
            location = random.choices(
                list(cls.LOCATION_DISTRIBUTION),
                weights=list(cls.LOCATION_DISTRIBUTION.values()),
            )[0]

        # Occupation pool depends only on age.
        if age < 25:
            occupation_pool = cls.OCCUPATIONS_YOUNG
        elif age < 55:
            occupation_pool = cls.OCCUPATIONS_MID
        else:
            occupation_pool = cls.OCCUPATIONS_SENIOR
        occupation = random.choice(occupation_pool)

        allowed_education = cls._filter_value(demographic_filter, 'education')
        if allowed_education:
            education = cls._choose(allowed_education)
        elif age < 22:
            education = "High School"
        else:
            education = random.choices(
                cls.EDUCATION_LEVELS,
                weights=[0.2, 0.40, 0.25, 0.05, 0.05, 0.05],
            )[0]

        values = cls._pick_values(demographic_filter)
        coordinates = cls._jitter_coordinates(location)
        income_level = cls._pick_income_level(demographic_filter, occupation, age)

        religions, rel_weights, ethnicities, eth_weights = cls._regional_demographics(location)
        allowed_religions = cls._filter_value(demographic_filter, 'religion')
        if allowed_religions:
            religion = cls._choose(allowed_religions)
        else:
            religion = random.choices(religions, weights=rel_weights)[0]
        allowed_ethnicities = cls._filter_value(demographic_filter, 'ethnicity')
        if allowed_ethnicities:
            ethnicity = cls._choose(allowed_ethnicities)
        else:
            ethnicity = random.choices(ethnicities, weights=eth_weights)[0]

        # Name pool follows the ethnicity; unknown ethnicities share one pool.
        first_names, surnames = cls._NAME_POOLS.get(ethnicity, cls._FALLBACK_NAME_POOL)
        name = f"{random.choice(first_names)} {random.choice(surnames)}"

        social_media_usage = cls._pick_social_media_usage(demographic_filter, age)

        allowed_leanings = cls._filter_value(demographic_filter, 'political_leaning')
        if allowed_leanings:
            political_leaning = cls._choose(allowed_leanings)
        else:
            political_leaning = random.choice(cls.POLITICAL_LEANING)

        personality_traits = random.sample(cls.PERSONALITY_TRAITS, random.randint(2, 3))

        return {
            "agent_id": f"agent_{index:04d}",
            "name": name,
            "age": age,
            "gender": gender,
            "location": location,
            "coordinates": coordinates,
            "occupation": occupation,
            "education": education,
            "income_level": income_level,
            "religion": religion,
            "ethnicity": ethnicity,
            "social_media_usage": social_media_usage,
            "political_leaning": political_leaning,
            "personality_traits": personality_traits,
            "values": values,
            "bio": ""
        }

    @classmethod
    def generate_social_network(
        cls,
        profiles: List[Dict[str, Any]],
        avg_friends: int = 10
    ) -> Dict[str, List[str]]:
        """Build a directed friendship map from profile similarity.

        Every other profile is scored against the current one: +3 for the same
        location, +2 per shared value, +2 for an age gap of at most 10 years
        (or +1 for at most 20). The target friend count is drawn from a
        Gaussian around ``avg_friends`` (standard deviation 3, at least 1);
        the best-scoring candidates are then each accepted with probability
        ``score / 10`` until the target is reached. Only twice the target
        number of candidates is considered, so an agent may end up with fewer
        friends than targeted, or none.

        The result is not symmetric: A listing B does not make B list A.
        Work grows quadratically with ``len(profiles)``.

        Args:
            profiles: Dicts carrying ``agent_id``, ``location``, ``values``
                and ``age``.
            avg_friends: Mean of the per-agent target friend count.

        Returns:
            Mapping of every ``agent_id`` to the list of chosen friend ids.
        """
        network = {p['agent_id']: [] for p in profiles}
        # Build each profile's value set once instead of once per pair.
        value_sets = [set(p['values']) for p in profiles]

        for profile, own_values in zip(profiles, value_sets):
            agent_id = profile['agent_id']
            candidates = []
            for other, other_values in zip(profiles, value_sets):
                if other['agent_id'] == agent_id:
                    continue
                score = 0
                if other['location'] == profile['location']:
                    score += 3
                score += len(own_values & other_values) * 2
                age_diff = abs(profile['age'] - other['age'])
                if age_diff <= 10:
                    score += 2
                elif age_diff <= 20:
                    score += 1
                if score > 0:
                    candidates.append((other['agent_id'], score))

            # Stable sort: ties keep their order of appearance in ``profiles``.
            candidates.sort(key=lambda pair: pair[1], reverse=True)
            num_friends = max(1, int(random.gauss(avg_friends, 3)))
            num_friends = min(num_friends, len(candidates))
            if not candidates:
                continue

            selected = []
            for cand_id, score in candidates[:num_friends * 2]:
                if len(selected) >= num_friends:
                    break
                # Scores of 10 or more are always accepted.
                if random.random() < (score / 10):
                    selected.append(cand_id)
            network[agent_id] = selected
        return network