| """ |
| PyPilot Data Preprocessor - Handles massive code datasets |
| """ |
import pickle
import multiprocessing as mp
from pathlib import Path
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace


class PyPilotDataPreprocessor:
    def __init__(self):
        # 'typescript' is included to match the extensions handled by detect_language()
        self.supported_languages = ['python', 'javascript', 'java', 'cpp', 'go', 'rust', 'typescript']
        self.processed_data = {}

    def load_github_dataset(self, language='python', split='train'):
        """Load a large code dataset from the Hugging Face Hub"""
        print(f"📥 Loading {language} code dataset...")
        # The github-code dataset filters on capitalized language names
        # (e.g. "Python", "C++"), so map our lowercase identifiers first.
        dataset_names = {'python': 'Python', 'javascript': 'JavaScript',
                         'java': 'Java', 'cpp': 'C++', 'go': 'GO',
                         'rust': 'Rust', 'typescript': 'TypeScript'}
        try:
            dataset = load_dataset("codeparrot/github-code", split=split,
                                   languages=[dataset_names.get(language, language)])
            print(f"✅ Loaded {len(dataset)} {language} files")
            return dataset
        except Exception as e:
            print(f"❌ Error loading dataset: {e}")
            return None
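    # A hedged sketch, assuming the same "codeparrot/github-code" dataset ID:
    # the full dump is far too large to download casually, so for quick
    # experiments streaming mode yields records lazily instead.
    #
    #   stream = load_dataset("codeparrot/github-code", split="train",
    #                         languages=["Python"], streaming=True)
    #   sample = next(iter(stream))  # one record with 'code' and 'language' fields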

    def build_tokenizer(self, dataset, vocab_size=50000):
        """Train a BPE tokenizer tailored to source code"""
        print("🔤 Building custom code tokenizer...")

        # Register [UNK] with the model so unseen symbols don't fail at encode time.
        tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
        tokenizer.pre_tokenizer = Whitespace()

        trainer = BpeTrainer(
            vocab_size=vocab_size,
            special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "[EOL]"]
        )

        def batch_iterator(batch_size=1000):
            # Slicing a Hugging Face Dataset returns a dict of columns,
            # so each step yields a list of raw source strings.
            for i in range(0, len(dataset), batch_size):
                yield dataset[i:i + batch_size]['code']

        tokenizer.train_from_iterator(batch_iterator(), trainer=trainer)
        tokenizer.save("./pypilot_tokenizer.json")
        print("✅ Tokenizer built and saved!")
        return tokenizer
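    # A hedged usage sketch: reload the saved tokenizer and round-trip a snippet
    # (Tokenizer.from_file / encode / decode are standard `tokenizers` APIs).
    #
    #   tok = Tokenizer.from_file("./pypilot_tokenizer.json")
    #   enc = tok.encode("def add(a, b):\n    return a + b")
    #   print(enc.tokens)            # learned sub-word pieces
    #   print(tok.decode(enc.ids))   # back to (whitespace-normalized) text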

    def parallel_process_files(self, file_paths, num_processes=8):
        """Process files in parallel for maximum speed"""
        print(f"⚡ Processing {len(file_paths)} files with {num_processes} processes...")

        # multiprocessing must pickle the worker callable, so it is the bound
        # method _process_file rather than a function nested inside this one.
        with mp.Pool(num_processes) as pool:
            results = pool.map(self._process_file, file_paths)

        successful = [r for r in results if 'error' not in r]
        print(f"✅ Processed {len(successful)} files successfully")
        return successful

    def _process_file(self, file_path):
        """Read one source file and return its content plus metadata."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            return {
                'file_path': str(file_path),
                'content': content,
                'length': len(content),
                'language': self.detect_language(file_path)
            }
        except Exception as e:
            return {'error': str(e), 'file_path': str(file_path)}
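    # A hedged usage sketch, with "./my_repo" as a hypothetical local checkout:
    #
    #   files = list(Path("./my_repo").rglob("*.py"))
    #   records = PyPilotDataPreprocessor().parallel_process_files(files, num_processes=4)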

    def detect_language(self, file_path):
        """Detect programming language from file extension"""
        extensions = {
            '.py': 'python',
            '.js': 'javascript',
            '.java': 'java',
            '.cpp': 'cpp',
            '.cc': 'cpp',
            '.go': 'go',
            '.rs': 'rust',
            '.ts': 'typescript'
        }
        # Lower-case the suffix so .PY and .py are treated alike.
        return extensions.get(Path(file_path).suffix.lower(), 'unknown')

    def create_training_pairs(self, code_samples, context_size=512):
        """Create (input, target) next-character pairs for training"""
        print("🔗 Creating training pairs...")
        training_pairs = []

        for sample in code_samples:
            # Local file records store source under 'content'; rows from the
            # github-code dataset store it under 'code'.
            code = sample.get('content') or sample.get('code', '')
            if len(code) > context_size:
                # Slide a half-overlapping window; the target is the input
                # shifted forward by one character.
                for i in range(0, len(code) - context_size, context_size // 2):
                    input_chunk = code[i:i + context_size]
                    target_chunk = code[i + 1:i + context_size + 1]
                    training_pairs.append({
                        'input': input_chunk,
                        'target': target_chunk,
                        'language': sample.get('language', 'unknown')
                    })

        print(f"✅ Created {len(training_pairs)} training pairs")
        return training_pairs
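    # A hedged sanity check: the shift-by-one construction means
    # pair['input'][1:] == pair['target'][:-1] for every pair.
    #
    #   pairs = PyPilotDataPreprocessor().create_training_pairs(
    #       [{'content': 'x = 1\n' * 200, 'language': 'python'}])
    #   assert pairs[0]['input'][1:] == pairs[0]['target'][:-1]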


if __name__ == "__main__":
    preprocessor = PyPilotDataPreprocessor()

    dataset = preprocessor.load_github_dataset('python')
    # load_github_dataset returns None on failure, so check identity, not truthiness.
    if dataset is not None:
        tokenizer = preprocessor.build_tokenizer(dataset)
        training_data = preprocessor.create_training_pairs(dataset)

        with open('processed_training_data.pkl', 'wb') as f:
            pickle.dump(training_data, f)
        print("💾 Training data saved!")