| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| """ |
| NeMo dependency structure definition. |
| This module analyzes the codebase to determine internal dependencies between NeMo collections and core components. |
| """ |
|
|
| import ast |
| import json |
| import os |
| from typing import Dict, List, Set |
|
|
|
|
| def find_python_files(directory: str) -> List[str]: |
| """Find all Python files in the given directory and its subdirectories.""" |
| python_files = [] |
| |
| relevant_dirs = ['nemo', 'scripts', 'examples', 'tests'] |
|
|
| for dir_name in relevant_dirs: |
| dir_path = os.path.join(directory, dir_name) |
| if os.path.exists(dir_path): |
| for root, _, files in os.walk(dir_path): |
| for file in files: |
| if file.endswith('.py'): |
| python_files.append(os.path.join(root, file)) |
|
|
| return python_files |
|
|
|
|
| def analyze_imports(nemo_root: str, file_path: str) -> Set[str]: |
| """Analyze a Python file and return its NeMo package dependencies using AST parsing.""" |
| imports = set() |
| visited = set() |
|
|
| def get_init_imports(module_path: str, depth: int = 0) -> Dict[str, str]: |
| """Recursively analyze imports from __init__.py files and map them to their final destinations.""" |
| |
| if depth > 10 or module_path in visited: |
| return {} |
|
|
| visited.add(module_path) |
| init_path = os.path.join(module_path, '__init__.py') |
| if not os.path.exists(init_path): |
| return {} |
|
|
| try: |
| with open(init_path, 'r', encoding='utf-8') as f: |
| init_tree = ast.parse(f.read(), filename=init_path) |
|
|
| import_map = {} |
| for node in ast.walk(init_tree): |
| if isinstance(node, ast.ImportFrom) and node.module and node.module.startswith('nemo.'): |
| if node.names: |
| for name in node.names: |
| if name.name == '*': |
| continue |
|
|
| |
| module_parts = node.module.split('.') |
| module_dir = os.path.join(nemo_root, *module_parts) |
|
|
| |
| if os.path.exists(os.path.join(module_dir, '__init__.py')): |
| sub_imports = get_init_imports(module_dir, depth + 1) |
| if name.name in sub_imports: |
| import_map[name.name] = sub_imports[name.name] |
| else: |
| |
| module_file = os.path.join(module_dir, f"{module_parts[-1]}.py") |
| if os.path.exists(module_file): |
| import_map[name.name] = f"{node.module}.{name.name}" |
| else: |
| |
| import_map[name.name] = f"{node.module}.{name.name}" |
|
|
| return import_map |
| except Exception as e: |
| print(f"Error analyzing {init_path}: {e}") |
| return {} |
|
|
| try: |
| with open(file_path, 'r', encoding='utf-8') as f: |
| tree = ast.parse(f.read(), filename=file_path) |
|
|
| for node in ast.walk(tree): |
| if isinstance(node, ast.ImportFrom) and node.module and node.module.startswith('nemo.'): |
| |
| parts = node.module.split('.') |
|
|
| if len(parts) == 1: |
| continue |
|
|
| if len(parts) >= 2: |
| module_type = parts[1] |
|
|
| if module_type == 'collections': |
| if len(parts) == 2: |
| continue |
| if node.names: |
| for name in node.names: |
| if name.name == '*': |
| continue |
|
|
| |
| module_path = os.path.join(nemo_root, *parts) |
| init_imports = get_init_imports(module_path) |
|
|
| if name.name in init_imports: |
| |
| imports.add(init_imports[name.name]) |
| else: |
| imports.add(f"{node.module}.{name.name}") |
|
|
| elif module_type in find_top_level_packages(nemo_root): |
| if node.names: |
| for name in node.names: |
| if name.name == '*': |
| continue |
|
|
| |
| module_path = os.path.join(nemo_root, *parts) |
| init_imports = get_init_imports(module_path) |
|
|
| if name.name in init_imports: |
| |
| imports.add(init_imports[name.name]) |
| else: |
| imports.add(f"{node.module}.{name.name}") |
|
|
| except Exception as e: |
| print(f"Error analyzing {file_path}: {e}") |
|
|
| return imports |
|
|
|
|
| def find_top_level_packages(nemo_root: str) -> List[str]: |
| """Find all top-level packages under nemo directory.""" |
| packages: List[str] = [] |
| nemo_dir = os.path.join(nemo_root, 'nemo') |
| tests_dir = os.path.join(nemo_root, 'tests') |
|
|
| if not os.path.exists(nemo_dir): |
| print(f"Warning: nemo directory not found at {nemo_dir}") |
| return packages |
| if not os.path.exists(tests_dir): |
| print(f"Warning: nemo directory not found at {nemo_dir}") |
| return packages |
|
|
| for item in os.listdir(nemo_dir) + os.listdir(tests_dir): |
| item_path = os.path.join(nemo_dir, item) |
| if os.path.isdir(item_path) and not item.startswith('__'): |
| packages.append(item) |
|
|
| return sorted(packages) |
|
|
|
|
| def find_collection_modules(nemo_root: str) -> Dict[str, List[str]]: |
| """Find all modules within collections.""" |
| collection_modules: Dict[str, List[str]] = {} |
| collections_dir = os.path.join(nemo_root, 'nemo', 'collections') |
|
|
| if not os.path.exists(collections_dir): |
| print(f"Warning: collections directory not found at {collections_dir}") |
| return collection_modules |
|
|
| for collection in os.listdir(collections_dir): |
| collection_path = os.path.join(collections_dir, collection) |
| if os.path.isdir(collection_path) and not collection.startswith('__'): |
| collection_modules[f"nemo.collections.{collection}"] = [] |
|
|
| return collection_modules |
|
|
|
|
| def build_dependency_graph(nemo_root: str) -> Dict[str, List[str]]: |
| """Build a dependency graph by analyzing all Python files.""" |
| |
| top_level_packages = find_top_level_packages(nemo_root) |
| print(f"Found top-level packages: {top_level_packages}") |
|
|
| dependencies: Dict[str, List[str]] = {} |
|
|
| for file_path in find_python_files(nemo_root): |
| relative_path = os.path.relpath(file_path, nemo_root) |
|
|
| parts = relative_path.split(os.sep) |
|
|
| if len(parts) == 1 or (parts[0] != "nemo" and parts[0] != "tests"): |
| continue |
|
|
| module_path = relative_path.replace(".py", "").replace("/", ".") |
| if parts[1] in top_level_packages and parts[1] != 'collections' and parts[0] != 'tests': |
| dependencies[module_path] = list(set(analyze_imports(nemo_root, file_path))) |
| elif parts[0] == 'tests': |
| dependencies[module_path] = [relative_path.replace("/", ".").replace(".py", "")] |
| elif parts[1] == 'collections': |
| dependencies[module_path] = list(set(analyze_imports(nemo_root, file_path))) |
|
|
| |
| reverse_dependencies: Dict[str, List[str]] = {} |
| |
| for package, deps in dependencies.items(): |
| for dep in deps: |
| if dep not in reverse_dependencies: |
| reverse_dependencies[dep] = [] |
| reverse_dependencies[dep].append(package) |
| dependencies = reverse_dependencies |
|
|
| |
| transitive_dependencies = dependencies.copy() |
| |
| while True: |
| changes_made = False |
| new_dependencies = transitive_dependencies.copy() |
|
|
| |
| for package, deps in transitive_dependencies.items(): |
| |
| for dep in deps: |
| |
| if dep in transitive_dependencies: |
| |
| for transitive_dep in transitive_dependencies[dep]: |
| if transitive_dep not in new_dependencies[package]: |
| new_dependencies[package].append(transitive_dep) |
| changes_made = True |
|
|
| |
| transitive_dependencies = new_dependencies |
|
|
| |
| if not changes_made: |
| break |
|
|
| dependencies = transitive_dependencies |
|
|
| |
| simplified_dependencies: Dict[str, List[str]] = {} |
| for package, deps in dependencies.items(): |
| package_parts = package.split('.') |
|
|
| if package_parts[0] == "tests": |
| simplified_package_path = f"{os.path.join(*package_parts)}.py" |
| elif os.path.isfile((file_path := f"{os.path.join(*package_parts[:-1])}.py")): |
| simplified_package_path = file_path |
| elif os.path.isdir((file_path := f"{os.path.join(*package_parts[:-1])}")): |
| simplified_package_path = file_path |
| else: |
| simplified_package_path = package |
|
|
| for dep in deps: |
| dep_parts = dep.split('.') |
|
|
| if simplified_package_path not in simplified_dependencies: |
| simplified_dependencies[simplified_package_path] = [] |
|
|
| if ( |
| len(dep_parts) >= 2 |
| and (dep_parts[1] in find_top_level_packages(nemo_root)) |
| and dep_parts[1] != 'collections' |
| ): |
| simplified_dependencies[simplified_package_path].append(f"{dep_parts[0]}.{dep_parts[1]}") |
| elif dep_parts[0] == "tests": |
| simplified_dependencies[simplified_package_path].append(".".join(dep_parts)) |
| elif len(dep_parts) >= 3 and ( |
| simplified_name := f"nemo.{dep_parts[1]}.{dep_parts[2]}" |
| ) in find_collection_modules(nemo_root): |
| simplified_dependencies[simplified_package_path].append(simplified_name) |
|
|
| simplified_dependencies[simplified_package_path].append(package) |
| simplified_dependencies[simplified_package_path] = sorted( |
| list(set(simplified_dependencies[simplified_package_path])) |
| ) |
| dependencies = simplified_dependencies |
|
|
| |
| bucket_deps: Dict[str, List[str]] = {} |
| for package, deps in dependencies.items(): |
| new_deps = [] |
| for dep in deps: |
| if ( |
| "nemo.collections.asr" in dep |
| or "nemo.collections.tts" in dep |
| or "nemo.collections.speechlm" in dep |
| or "nemo.collections.audio" in dep |
| or "tests.collections.asr" in dep |
| or "tests.collections.tts" in dep |
| or "tests.collections.speechlm" in dep |
| or "tests.collections.audio" in dep |
| ): |
| new_deps.append("speech") |
| new_deps.append("unit-tests") |
|
|
| if "nemo.export" in dep or "nemo.deploy" in dep or "tests.export" in dep or "tests.deploy" in dep: |
| new_deps.append("export-deploy") |
| new_deps.append("unit-tests") |
|
|
| if ( |
| "nemo.collections.llm" in dep |
| or "nemo.collections.vlm" in dep |
| or "nemo.automodel" in dep |
| or "tests.collections.llm" in dep |
| or "tests.collections.vlm" in dep |
| or "tests.automodel" in dep |
| ): |
| new_deps.append("automodel") |
| new_deps.append("unit-tests") |
|
|
| if "tests" in dep and "tests.functional_tests" not in dep: |
| new_deps.append("unit-tests") |
|
|
| if ( |
| "nemo.collections" in dep |
| and "nemo.collections.asr" not in dep |
| and "nemo.collections.tts" not in dep |
| and "nemo.collections.speechlm" not in dep |
| and "nemo.collections.audio" not in dep |
| and "tests.collections.asr" not in dep |
| and "tests.collections.tts" not in dep |
| and "tests.collections.speechlm" not in dep |
| and "tests.collections.audio" not in dep |
| ): |
| new_deps.append("nemo2") |
| new_deps.append("unit-tests") |
|
|
| bucket_deps[package] = sorted(list(set(new_deps))) |
|
|
| dependencies = bucket_deps |
|
|
| |
| |
| requirements_dir = os.path.join(nemo_root, "requirements") |
| if os.path.exists(requirements_dir): |
| for filename in os.listdir(requirements_dir): |
| filepath = os.path.join("requirements", filename) |
| relative_path = os.path.relpath(filepath, nemo_root) |
|
|
| dependencies[relative_path] = [ |
| "nemo2", |
| "unit-tests", |
| "speech", |
| "automodel", |
| "export-deploy", |
| ] |
|
|
| |
| for root, _, files in os.walk(nemo_root): |
| for file_path in files: |
| full_path = os.path.join(root, file_path) |
| relative_path = os.path.relpath(full_path, nemo_root) |
|
|
| if "cicd-main-export-deploy" in file_path: |
| dependencies[relative_path] = ["export-deploy"] |
| if "cicd-main-nemo2" in file_path: |
| dependencies[relative_path] = ["nemo2"] |
| if "cicd-main-speech" in file_path: |
| dependencies[relative_path] = ["speech"] |
| if "cicd-main-automodel" in file_path: |
| dependencies[relative_path] = ["automodel"] |
| if "cicd-main-unit-tests" in file_path: |
| dependencies[relative_path] = ["unit-tests"] |
| if "Dockerfile" in file_path: |
| dependencies[relative_path] = ["nemo2", "unit-tests", "speech", "automodel", "export-deploy"] |
|
|
| |
| dependencies = dict(sorted(dependencies.items(), key=lambda x: len(x[1]), reverse=True)) |
|
|
| return dependencies |
|
|
|
|
| def main(): |
| """Main function to analyze dependencies and output JSON.""" |
| |
| nemo_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
|
|
| |
| dependencies = build_dependency_graph(nemo_root) |
|
|
| |
| data = json.dumps(dependencies, indent=4) |
|
|
| with open('nemo_dependencies.json', 'w', encoding='utf-8') as f: |
| f.write(data) |
|
|
|
|
| if __name__ == "__main__": |
| main() |
|
|