Spaces:

michaelryt
/

Twitter_Sentiment_Analysis

Build error

Michael Rey

added all files

74c7b8b 12 months ago

5.81 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	from sklearn.preprocessing import StandardScaler, LabelEncoder
	from sklearn.cluster import DBSCAN
	from sklearn.ensemble import RandomForestClassifier
	import matplotlib.pyplot as plt
	import seaborn as sns

	# Load the dataset
	def load_data(path="Animal Dataset.csv"):
	    """Read the animal dataset CSV into a DataFrame.

	    Args:
	        path: Location of the CSV file. Defaults to ``"Animal Dataset.csv"``,
	            which is resolved relative to the current working directory.

	    Returns:
	        The ``pandas.DataFrame`` returned by ``pd.read_csv``.
	    """
	    return pd.read_csv(path)

	# Convert ranges to float (if needed)
	def convert_range_to_float(val):
	    """Convert a cell value to a float, averaging ``"low-high"`` ranges.

	    Args:
	        val: A number, a numeric string such as ``"12.5"``, or a range
	            string such as ``"10-20"``.

	    Returns:
	        ``float(val)`` when the value is directly convertible; the midpoint
	        of the two bounds for a ``"low-high"`` string; ``np.nan`` for
	        anything else (``None``, free text, malformed ranges).
	    """
	    # Try the plain conversion first so that strings which merely contain a
	    # hyphen ("-5", "1e-5") are read as numbers rather than as ranges.
	    try:
	        return float(val)
	    except (TypeError, ValueError, OverflowError):
	        pass

	    if not isinstance(val, str) or '-' not in val:
	        return np.nan

	    # A range must split into exactly two numeric parts; any other shape
	    # raises ValueError (bad unpack or bad float) and is reported as NaN.
	    try:
	        start, end = map(float, val.split('-'))
	    except ValueError:
	        return np.nan
	    return (start + end) / 2

	# Page title and the two tabs the rest of the script renders into
	st.title("🐾 Animal Diet Prediction and Lifespan Clustering")
	st.markdown("#### Check animal diet and group them according to lifespan using Random Forest Classifier and DBSCAN")

	tabs = st.tabs(["📊 Supervised Learning", "🔍 Unsupervised Learning"])

	data = load_data()

	# Numeric columns used by both tabs. 'Lifespan (years)' must stay LAST:
	# later code takes numeric_features[:-1] as the model input columns.
	numeric_features = [
	    'Height (cm)',
	    'Weight (kg)',
	    'Average Speed (km/h)',
	    'Top Speed (km/h)',
	    'Gestation Period (days)',
	    'Offspring per Birth',
	    'Lifespan (years)',
	]

	# Normalise every numeric column to floats ("10-20" style ranges become
	# their midpoint, unparseable cells become NaN)
	for col in numeric_features:
	    data[col] = data[col].apply(convert_range_to_float)

	# Drop rows with any unusable numeric value. The explicit copy makes `data`
	# an independent frame so the column assignment below cannot be flagged as
	# a write to a slice of the raw frame.
	data = data.dropna(subset=numeric_features).copy()

	# Encode the target for supervised learning. `label_encoder` and the
	# 'Diet_encoded' column only exist when the CSV has a 'Diet' column.
	if 'Diet' in data.columns:
	    label_encoder = LabelEncoder()
	    data['Diet_encoded'] = label_encoder.fit_transform(data['Diet'])

	# Tab 1: Supervised Learning
	with tabs[0]:
	    st.header("Supervised: Predict Animal Diet Type")

	    if 'Diet_encoded' not in data.columns:
	        # The encoded target (and `label_encoder`) are only created when the
	        # CSV has a 'Diet' column; without it there is nothing to train on.
	        st.warning("The dataset has no 'Diet' column, so diet prediction is unavailable.")
	    else:
	        # Model inputs: every numeric feature except the trailing lifespan column
	        feature_cols = numeric_features[:-1]
	        slider_help = {
	            'Height (cm)': "How tall is the animal?",
	            'Weight (kg)': "How much does the animal weigh?",
	            'Average Speed (km/h)': "How fast can the animal move?",
	            'Top Speed (km/h)': "What is the maximum speed the animal can reach?",
	            'Gestation Period (days)': "How long is the animal's pregnancy?",
	            'Offspring per Birth': "How many offspring does the animal give birth to at once?",
	        }

	        # One slider per feature, labelled with the column name
	        st.subheader("Adjust the animal's characteristics to predict its diet")
	        selected = {}
	        for feature in feature_cols:
	            selected[feature] = st.slider(
	                feature,
	                # floor/ceil so fractional extremes in the data stay inside
	                # the slider range instead of being truncated away
	                min_value=int(np.floor(data[feature].min())),
	                max_value=int(np.ceil(data[feature].max())),
	                help=slider_help.get(feature),
	            )

	        # Single-row frame with the same columns, in the same order, as the
	        # training matrix
	        input_data = pd.DataFrame([selected], columns=feature_cols)

	        # Train a RandomForest classifier on the full cleaned dataset. The
	        # script reruns on every widget change, so the seed is fixed to keep
	        # the prediction stable for identical slider values.
	        X = data[feature_cols]
	        y = data['Diet_encoded']
	        clf = RandomForestClassifier(random_state=42)
	        clf.fit(X, y)
	        diet_pred = clf.predict(input_data)

	        # Map the encoded class back to its original diet label
	        predicted_diet = label_encoder.inverse_transform(diet_pred)
	        st.subheader(f"Predicted Diet: {predicted_diet[0]}")

	# Tab 2: Unsupervised Learning
	with tabs[1]:
	    st.header("Unsupervised: Group Animals Based on Lifespan")
	    st.write("In this part, we will group animals based on their lifespan. This helps to identify patterns in how different animals live and how long they survive.")

	    # Select minimum and maximum lifespan. floor/ceil keep animals with a
	    # fractional extreme lifespan reachable from the sliders.
	    st.subheader("Choose the lifespan range to group animals")
	    lifespan_lo = int(np.floor(data['Lifespan (years)'].min()))
	    lifespan_hi = int(np.ceil(data['Lifespan (years)'].max()))
	    lifespan_min = st.slider(
	        "Lifespan Min (years)",
	        min_value=lifespan_lo,
	        max_value=lifespan_hi,
	        value=lifespan_lo,
	        help="The minimum lifespan of the animals you are interested in.",
	    )
	    # Start the upper slider at the top of the range; without an explicit
	    # value it would start at the minimum and the initial selection would
	    # contain only the shortest-lived animals.
	    lifespan_max = st.slider(
	        "Lifespan Max (years)",
	        min_value=lifespan_lo,
	        max_value=lifespan_hi,
	        value=lifespan_hi,
	        help="The maximum lifespan of the animals you are interested in.",
	    )

	    # Keep rows whose lifespan lies in [lifespan_min, lifespan_max]. Copy so
	    # the 'Cluster' column is added to an independent frame, not to a slice
	    # of `data`.
	    in_range = (data['Lifespan (years)'] >= lifespan_min) & (data['Lifespan (years)'] <= lifespan_max)
	    filtered_data = data[in_range].copy()

	    if filtered_data.empty:
	        st.warning("No animals found with the selected lifespan range. Please adjust the sliders.")
	    else:
	        # Standardise the clustering inputs; 'Lifespan (years)' (the last
	        # entry of numeric_features) is deliberately left out.
	        scaler = StandardScaler()
	        X_scaled = scaler.fit_transform(filtered_data[numeric_features[:-1]])

	        # DBSCAN with fixed, hand-picked parameters (eps=1.5 is not the
	        # library default). Points that belong to no cluster get label -1.
	        db = DBSCAN(eps=1.5, min_samples=5)
	        clusters = db.fit_predict(X_scaled)
	        filtered_data['Cluster'] = clusters

	        st.subheader("Clustered Animals by Lifespan")
	        st.write("Animals are grouped based on their characteristics (except lifespan). The animals in the same cluster share similarities.")

	        # Show the animals matching the selected lifespan range
	        st.subheader("Animals in the Selected Lifespan Range")
	        st.dataframe(filtered_data[['Animal', 'Lifespan (years)', 'Weight (kg)', 'Cluster']])

	        # Scatter plot of lifespan against weight, coloured by cluster label
	        st.subheader("Cluster Plot (Lifespan vs Weight)")
	        fig, ax = plt.subplots(figsize=(10, 6))
	        sns.scatterplot(
	            data=filtered_data,
	            x='Lifespan (years)',
	            y='Weight (kg)',
	            hue='Cluster',
	            palette="tab10",
	            ax=ax,
	        )
	        st.pyplot(fig)
	        # The script reruns on every interaction; close the figure so
	        # matplotlib does not accumulate open figures across reruns.
	        plt.close(fig)