| import streamlit as st |
| import javalang |
| import torch |
| import torch.nn.functional as F |
| import re |
| from transformers import AutoTokenizer, AutoModel |
| import warnings |
| import pandas as pd |
| import zipfile |
| import os |
|
|
| |
# Browser-tab title, icon and wide layout for the app. Kept ahead of every
# other Streamlit call in this script.
_PAGE_SETTINGS = {
    "page_title": "Java Code Clone Detector (IJaDataset 2.1)",
    "page_icon": "🔍",
    "layout": "wide",
}
st.set_page_config(**_PAGE_SETTINGS)
|
|
| |
# Silence every warning category for the lifetime of the process.
warnings.filterwarnings(action="ignore")

# Hugging Face identifier of the encoder used to embed code.
MODEL_NAME = "microsoft/codebert-base"
# Upper bound on the number of tokens passed to the encoder per snippet.
MAX_LENGTH = 512
# Archive that is unpacked to obtain the example clone pairs.
DATASET_PATH = "archive (1).zip"

# Run on the GPU when torch can see one, otherwise on the CPU.
_DEVICE_KIND = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE = torch.device(_DEVICE_KIND)
|
|
| |
@st.cache_resource
def _load_models_cached():
    """Load the tokenizer and encoder once; raise on failure.

    Letting the exception propagate means a failed attempt is not stored by
    ``st.cache_resource``, so the next script rerun tries again.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)
    return tokenizer, model


def load_models():
    """Return the ``(tokenizer, model)`` pair for ``MODEL_NAME``.

    The successful result is cached across reruns by the private helper.
    On any loading error the message is shown with ``st.error`` and
    ``(None, None)`` is returned. The failure itself is not cached, so a
    transient problem (for example a download error) does not require
    clearing the cache or restarting the app.

    Returns:
        tuple: ``(tokenizer, model)`` with the model moved to ``DEVICE``, or
        ``(None, None)`` if loading failed.
    """
    try:
        return _load_models_cached()
    except Exception as e:
        st.error(f"Failed to load models: {str(e)}")
        return None, None
|
|
@st.cache_resource
def _load_dataset_cached():
    """Extract the archive if needed and read the example pairs; raise on failure.

    Exceptions propagate so that a failed attempt is not stored by
    ``st.cache_resource`` and is retried on the next rerun.
    """
    base_path = "Subject_CloneTypes_Directories"
    if not os.path.exists(base_path):
        with zipfile.ZipFile(DATASET_PATH, 'r') as zip_ref:
            zip_ref.extractall(".")

    clone_pairs = []
    for clone_type in ("Clone_Type1", "Clone_Type2", "Clone_Type3 - ST"):
        type_path = os.path.join(base_path, clone_type)
        if not os.path.exists(type_path):
            continue

        for root, dirs, files in os.walk(type_path):
            # os.walk yields names in arbitrary order; sorting both lists
            # in place makes the traversal and the chosen pair reproducible.
            dirs.sort()
            files.sort()
            if len(files) < 2:
                continue

            sources = []
            for name in files[:2]:
                # errors="replace" keeps one stray non-UTF-8 byte from
                # discarding the whole example set.
                with open(os.path.join(root, name), 'r',
                          encoding='utf-8', errors='replace') as handle:
                    sources.append(handle.read())

            clone_pairs.append({
                "type": clone_type,
                "code1": sources[0],
                "code2": sources[1],
            })
            # Only the first directory holding two or more files is used
            # for each clone type.
            break

    return clone_pairs[:10]


def load_dataset():
    """Return example clone pairs read from the extracted dataset.

    For each of the three clone-type directories, the first directory (in
    sorted traversal order) containing at least two files contributes one
    pair built from its first two files by name.

    Returns:
        list[dict]: Up to ten dicts with keys ``"type"``, ``"code1"`` and
        ``"code2"``. If extraction or reading fails, the error is shown with
        ``st.error`` and an empty list is returned. That failure is not
        cached, so it is retried on the next rerun.
    """
    try:
        return _load_dataset_cached()
    except Exception as e:
        st.error(f"Error loading dataset: {str(e)}")
        return []
|
|
# Load the encoder first, then the example pairs. Both loaders report their
# own failures in the UI and hand back empty placeholders instead of raising.
_model_bundle = load_models()
tokenizer = _model_bundle[0]
code_model = _model_bundle[1]
dataset_pairs = load_dataset()
|
|
# One alternation scanned left to right: literals are matched first so that
# comment markers inside them are consumed as part of the literal and kept.
_COMMENT_OR_LITERAL_RE = re.compile(
    r'"""(?:\\.|[^\\])*?"""'      # text block (may span lines)
    r'|"(?:\\.|[^"\\\n])*"'       # string literal
    r"|'(?:\\.|[^'\\\n])*'"       # character literal
    r"|//[^\n]*"                  # line comment
    r"|/\*.*?\*/",                # block comment
    re.DOTALL,
)


def _drop_comment(match):
    """Return '' for a comment match and the text itself for a literal."""
    token = match.group(0)
    return "" if token.startswith(("//", "/*")) else token


def normalize_code(code):
    """Strip Java comments from ``code`` and collapse its whitespace.

    Comments are removed in a single pass that also recognises string,
    character and text-block literals, so ``//`` or ``/*`` inside a literal
    (for example ``"http://host"``) is preserved, and ``//`` inside a block
    comment cannot swallow the closing ``*/``. Every run of whitespace is
    then replaced by one space and the result is trimmed.

    Args:
        code: Source text. A value that is not a ``str`` is returned
            unchanged.

    Returns:
        The normalized text, or ``code`` itself when it is not a ``str``.
    """
    if not isinstance(code, str):
        return code
    without_comments = _COMMENT_OR_LITERAL_RE.sub(_drop_comment, code)
    return re.sub(r'\s+', ' ', without_comments).strip()
|
|
def get_embedding(code):
    """Embed a code snippet as the mean of its token vectors.

    The snippet is normalized, tokenized (truncated to ``MAX_LENGTH``
    tokens) and run through the encoder without gradient tracking. The
    input is not padded, and the mean is weighted by the attention mask, so
    only real tokens contribute. Averaging over padding positions would pull
    every snippet towards the same vector and inflate similarity scores.

    Args:
        code: Source text of the snippet.

    Returns:
        A tensor with one pooled vector per input row, or ``None`` after
        reporting the problem with ``st.error`` when the model is not
        loaded or processing fails.
    """
    if tokenizer is None or code_model is None:
        st.error("Error processing code: model is not loaded")
        return None

    try:
        code = normalize_code(code)
        inputs = tokenizer(
            code,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_LENGTH,
        ).to(DEVICE)

        with torch.no_grad():
            outputs = code_model(**inputs)

        hidden = outputs.last_hidden_state
        # Broadcast the mask over the hidden dimension and average only
        # over positions the tokenizer marked as real tokens.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)
        return (hidden * mask).sum(dim=1) / token_counts
    except Exception as e:
        st.error(f"Error processing code: {str(e)}")
        return None
|
|
def compare_code(code1, code2):
    """Return the cosine similarity between the embeddings of two snippets.

    Args:
        code1: First snippet.
        code2: Second snippet.

    Returns:
        The similarity as a Python float, or ``None`` when either snippet
        is empty or either embedding could not be computed.
    """
    if not (code1 and code2):
        return None

    with st.spinner('Analyzing code...'):
        # Both embeddings are always requested, even if the first one fails.
        first_vec = get_embedding(code1)
        second_vec = get_embedding(code2)

        if first_vec is None:
            return None
        if second_vec is None:
            return None

        with torch.no_grad():
            score = F.cosine_similarity(first_vec, second_vec)
            similarity = score.item()

    return similarity
|
|
| |
# Page heading and one-line description.
st.title("🔍 Java Code Clone Detector (IJaDataset 2.1)")
st.markdown("Compare Java code snippets from the IJaDataset 2.1 using CodeBERT embeddings.")

# Optional picker for a preloaded example. The numeric prefix keeps labels
# unique when several pairs share a clone type.
selected_pair = None
if dataset_pairs:
    pair_options = {}
    for position, pair in enumerate(dataset_pairs, start=1):
        pair_options[f"{position}: {pair['type']}"] = pair
    selected_option = st.selectbox("Select a preloaded example pair:", list(pair_options))
    selected_pair = pair_options[selected_option]
|
|
| |
# Prefill the editors from the chosen example, or leave them empty.
if selected_pair:
    _initial_first = selected_pair["code1"]
    _initial_second = selected_pair["code2"]
else:
    _initial_first = ""
    _initial_second = ""

# Two side-by-side editors, one per snippet.
col1, col2 = st.columns(2)

with col1:
    code1 = st.text_area(
        "First Java Code",
        value=_initial_first,
        height=300,
        help="Enter the first Java code snippet",
    )

with col2:
    code2 = st.text_area(
        "Second Java Code",
        value=_initial_second,
        height=300,
        help="Enter the second Java code snippet",
    )

# Minimum similarity at which the pair is reported as a clone.
threshold = st.slider(
    "Clone Detection Threshold",
    value=0.75,
    min_value=0.50,
    max_value=1.00,
    step=0.01,
    help="Similarity score needed to consider code as cloned (0.5-1.0)",
)
|
|
| |
# Run the comparison on demand and render score, threshold and verdict.
if st.button("Compare Code"):
    if not code1.strip() or not code2.strip():
        # Blank input would otherwise produce no feedback at all.
        st.warning("Please enter both Java code snippets before comparing.")
        similarity = None
    else:
        similarity = compare_code(code1, code2)

    if similarity is not None:
        is_clone = similarity >= threshold

        st.subheader("Results")
        cols = st.columns(3)
        cols[0].metric("Similarity Score", f"{similarity:.3f}")
        cols[1].metric("Current Threshold", f"{threshold:.3f}")
        cols[2].metric(
            "Verdict",
            "✅ CLONE" if is_clone else "❌ NOT CLONE",
            delta=f"{similarity-threshold:+.3f}",
            help=f"Score {'≥' if is_clone else '<'} threshold"
        )

        # Cosine similarity can be negative, and rounding can push it just
        # above 1.0. st.progress only accepts floats within [0.0, 1.0].
        st.progress(min(max(similarity, 0.0), 1.0))

        with st.expander("Interpretation Guide"):
            st.markdown("\n".join((
                "- **> 0.95**: Nearly identical (Type-1 clone)",
                "- **0.85-0.95**: Very similar (Type-2 clone)",
                "- **0.70-0.85**: Similar structure (Type-3 clone)",
                "- **< 0.70**: Different code",
            )))
|
|
# Show what each editor's text looks like after normalization, one tab per
# snippet.
with st.expander("Show normalized code"):
    first_tab, second_tab = st.tabs(["First Code", "Second Code"])
    for tab, snippet in ((first_tab, code1), (second_tab, code2)):
        with tab:
            st.code(normalize_code(snippet))
|
|
# Footer: horizontal rule followed by a short note about the dataset.
_DATASET_NOTES = """
**Dataset Information**:
- Using IJaDataset 2.1 from Kaggle
- Contains 100K Java files with clone annotations
- Clone types: Type-1, Type-2, and Type-3 clones
"""
st.markdown("---")
st.markdown(_DATASET_NOTES)