| import pandas as pd |
| import os |
| import shutil |
|
|
|
|
class data_cache:
    """In-memory copy of a tabular file plus a lazily built summary of it.

    Attributes:
        file_path: Path passed to the constructor, stored unchanged.
        data_cache_path: Directory component of ``file_path``.
        data: The ``pandas.DataFrame`` loaded from ``file_path``.
        general_info: Summary dict; empty until ``get_description`` is called.
    """

    def __init__(self, file_path) -> None:
        """Load ``file_path`` into ``self.data``.

        The loader is selected from the file extension, compared
        case-insensitively: ``.csv`` is read with ``pandas.read_csv`` (UTF-8),
        ``.xlsx`` / ``.xls`` with ``pandas.read_excel``.

        Args:
            file_path: Location of the CSV or Excel file to load.

        Raises:
            ValueError: If the extension is not ``.csv``, ``.xlsx`` or ``.xls``.
        """
        self.file_path = file_path
        self.data_cache_path = os.path.dirname(file_path)
        file_extension = os.path.splitext(file_path)[1].lower()

        if file_extension == '.csv':
            self.data = pd.read_csv(self.file_path, encoding='utf-8')
        elif file_extension in ('.xlsx', '.xls'):
            self.data = pd.read_excel(self.file_path)
        else:
            raise ValueError(f"Unsupported file format: {file_extension}")
        self.general_info = {}

    def get_description(self) -> dict:
        """Fill ``self.general_info`` with a summary of the data and return it.

        The entries ``num_rows``, ``num_features``, ``features``, ``col_type``
        and ``missing_val`` are copied from ``get_general_info(self.data)``;
        ``describe`` holds the result of ``self.data.describe()``.

        Returns:
            The same ``self.general_info`` dict stored on the instance (not a
            copy), refreshed on every call.
        """
        summary = get_general_info(self.data)
        for key in ("num_rows", "num_features", "features", "col_type",
                    "missing_val"):
            self.general_info[key] = summary[key]

        # DataFrame.describe() raises ValueError when the frame has no columns
        # (e.g. an empty Excel sheet); fall back to an empty frame so the rest
        # of the summary stays usable.
        if len(self.data.columns) == 0:
            self.general_info["describe"] = pd.DataFrame()
        else:
            self.general_info["describe"] = self.data.describe()

        return self.general_info
|
|
|
|
def get_general_info(data: pd.DataFrame):
    """Gather basic structural facts about ``data``.

    Returns:
        A dict with these entries:
            ``num_rows``      first element of ``data.shape``
            ``num_features``  second element of ``data.shape``
            ``features``      ``data.columns``
            ``col_type``      ``data.dtypes``
            ``missing_val``   per-column null counts, ``data.isnull().sum()``
    """
    dimensions = data.shape

    summary = {}
    summary["num_rows"] = dimensions[0]
    summary["num_features"] = dimensions[1]
    summary["features"] = data.columns
    summary["col_type"] = data.dtypes
    summary["missing_val"] = data.isnull().sum()
    return summary
|
|
|
|