| import streamlit as st |
| import javalang |
| import torch |
| import torch.nn.functional as F |
| import re |
| from transformers import AutoTokenizer, AutoModel |
| import warnings |
|
|
| |
# Browser-tab title, icon and wide layout for the app page.
_PAGE_SETTINGS = {
    "page_title": "Java Code Clone Detector",
    "page_icon": "🔍",
    "layout": "wide",
}
st.set_page_config(**_PAGE_SETTINGS)

# Suppress every Python warning raised in this process.
warnings.filterwarnings(action="ignore")
|
|
| |
# Hugging Face identifier used to load both the tokenizer and the encoder.
MODEL_NAME = "microsoft/codebert-base"
# Maximum number of tokens handed to the tokenizer (longer inputs are truncated).
MAX_LENGTH = 512
# Prefer the GPU when CUDA is available; otherwise run on the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
|
|
| |
@st.cache_resource
def _load_models_cached():
    """Load the CodeBERT tokenizer and encoder and move the encoder to DEVICE.

    Errors are deliberately NOT caught here: only a successful return value
    should end up in the resource cache, so that a transient failure
    (network hiccup, partial download) can be retried on the next rerun.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)
    return tokenizer, model


def load_models():
    """Return the shared ``(tokenizer, model)`` pair, or ``(None, None)`` on failure.

    Successful loads are served from Streamlit's resource cache. When loading
    fails, the error is shown in the UI and ``(None, None)`` is returned
    without being cached, so the next script run tries again.

    Returns:
        tuple: ``(tokenizer, model)`` on success, ``(None, None)`` otherwise.
    """
    try:
        return _load_models_cached()
    except Exception as e:
        # Top-level UI boundary: report the problem instead of crashing the app.
        st.error(f"Failed to load models: {str(e)}")
        return None, None


# Keep the cache-clearing hook that the decorated function used to expose.
load_models.clear = _load_models_cached.clear
|
|
# Fetch the tokenizer / encoder pair; both are None when loading failed.
tokenizer, code_model = load_models()

# Page header followed by a short description of the tool.
st.title("🔍 Java Code Clone Detector")
st.markdown(
    "\n"
    "Compare two Java code snippets to detect potential clones using CodeBERT embeddings.\n"
    "The similarity score ranges from 0 (completely different) to 1 (identical).\n"
)
|
|
| |
# Default snippets pre-filled into the two text areas. They differ only in
# the class name, so they make a convenient "obvious clone" demo pair.
EXAMPLE_1 = "\n".join(
    [
        "public class Hello {",
        "public static void main(String[] args) {",
        'System.out.println("Hello, World!");',
        "}",
        "}",
    ]
)

EXAMPLE_2 = "\n".join(
    [
        "public class Greet {",
        "public static void main(String[] args) {",
        'System.out.println("Hello, World!");',
        "}",
        "}",
    ]
)
|
|
| |
# Two side-by-side columns, one text area per snippet.
col1, col2 = st.columns(2)

# Left column: first snippet, pre-filled with the first example.
code1 = col1.text_area(
    "First Java Code",
    value=EXAMPLE_1,
    height=300,
    help="Enter the first Java code snippet",
)

# Right column: second snippet, pre-filled with the second example.
code2 = col2.text_area(
    "Second Java Code",
    value=EXAMPLE_2,
    height=300,
    help="Enter the second Java code snippet",
)
|
|
| |
# Similarity cut-off at or above which the pair is reported as a clone.
# Positional arguments are: label, min_value, max_value, value, step.
threshold = st.slider(
    "Clone Detection Threshold",
    0.5,
    1.0,
    0.85,
    0.01,
    help="Adjust the similarity threshold for clone detection",
)
|
|
| |
# One left-to-right scan over the source. Literals are captured (group 1) so
# they can be emitted unchanged; comments are matched without a group so they
# can be dropped. Scanning both in the same pattern is what prevents "//" or
# "/*" inside a string literal from being mistaken for a comment.
_JAVA_LITERAL_OR_COMMENT = re.compile(
    r'("""(?:\\[\s\S]|[^\\])*?"""'   # text block
    r'|"(?:\\.|[^"\\\n])*"'          # string literal
    r"|'(?:\\.|[^'\\\n])*')"         # char literal
    r'|//[^\n]*'                     # line comment
    r'|/\*[\s\S]*?\*/'               # block comment
)


def normalize_code(code):
    """Strip Java comments from *code* and collapse whitespace.

    Line (``// ...``) and block (``/* ... */``) comments are removed, while
    comment-like text inside string, char and text-block literals is kept.
    Each removed comment is replaced by a single space so the tokens on
    either side of it do not fuse. Afterwards every run of whitespace is
    collapsed to one space and the result is trimmed.

    Args:
        code: Java source text. Non-string values are returned unchanged.

    Returns:
        The normalized source as a single-line string, or *code* itself when
        it is not a string.
    """
    if not isinstance(code, str):
        return code

    def _keep_literal_drop_comment(match):
        literal = match.group(1)
        # Group 1 is set only for literals; anything else matched is a comment.
        return literal if literal is not None else " "

    without_comments = _JAVA_LITERAL_OR_COMMENT.sub(_keep_literal_drop_comment, code)
    return re.sub(r"\s+", " ", without_comments).strip()
|
|
| |
def get_embedding(code):
    """Return a mean-pooled CodeBERT embedding for one code snippet.

    The snippet is normalized, tokenized (truncated to ``MAX_LENGTH`` tokens)
    and run through the encoder without gradient tracking. The token states
    of the last layer are then averaged, weighted by the attention mask, so
    only real tokens contribute to the embedding.

    Args:
        code: Source text of the snippet.

    Returns:
        A tensor with one row per input sequence (a single row here), or
        ``None`` when anything fails; the error is shown in the UI.
    """
    try:
        code = normalize_code(code)
        # No padding: a single sequence has nothing to be aligned with, and
        # padding to MAX_LENGTH would only add pad positions to the encoder
        # input and to the average below.
        inputs = tokenizer(
            code,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_LENGTH,
        ).to(DEVICE)

        with torch.no_grad():
            outputs = code_model(**inputs)

        hidden = outputs.last_hidden_state
        # Mask-weighted mean over the token axis; stays correct even if the
        # tokenizer call is later changed to pad again.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_sum = (hidden * mask).sum(dim=1)
        token_count = mask.sum(dim=1).clamp(min=1.0)
        return token_sum / token_count
    except Exception as e:
        # UI boundary: surface the failure and let the caller handle None.
        st.error(f"Error processing code: {str(e)}")
        return None
|
|
| |
def _is_blank(snippet):
    """Return True for None, empty, or whitespace-only snippets."""
    if not snippet:
        return True
    return isinstance(snippet, str) and not snippet.strip()


def compare_code(code1, code2):
    """Return the cosine similarity between the embeddings of two snippets.

    Args:
        code1: First code snippet.
        code2: Second code snippet.

    Returns:
        The similarity as a Python float, or ``None`` when either snippet is
        missing / whitespace-only or when an embedding could not be computed.
    """
    # Whitespace-only input carries no code, so there is nothing to compare.
    if _is_blank(code1) or _is_blank(code2):
        return None

    with st.spinner('Analyzing code...'):
        emb1 = get_embedding(code1)
        emb2 = get_embedding(code2)

        if emb1 is None or emb2 is None:
            return None

        with torch.no_grad():
            similarity = F.cosine_similarity(emb1, emb2).item()

        return similarity
|
|
| |
# Runs only on the script rerun triggered by clicking the button.
if st.button("Compare Code", type="primary"):
    if tokenizer is None or code_model is None:
        st.error("Models failed to load. Please check the logs.")
    elif not code1.strip() or not code2.strip():
        # Give explicit feedback instead of silently showing nothing.
        st.warning("Please enter Java code in both text areas before comparing.")
    else:
        similarity = compare_code(code1, code2)

        # None means an embedding failed; that error has already been shown.
        if similarity is not None:
            st.subheader("Results")

            # Cosine similarity can be negative or, through float rounding,
            # marginally above 1.0, while the progress bar only accepts
            # floats in [0.0, 1.0]. Clamp for display only.
            st.progress(min(max(similarity, 0.0), 1.0))

            # Separate names so the input columns are not shadowed.
            score_col, threshold_col, verdict_col = st.columns(3)

            with score_col:
                st.metric("Similarity Score", f"{similarity:.3f}")

            with threshold_col:
                st.metric("Threshold", f"{threshold:.3f}")

            with verdict_col:
                is_clone = similarity >= threshold
                st.metric(
                    "Clone Detection",
                    "✅ Clone" if is_clone else "❌ Not a Clone",
                    delta=f"{similarity-threshold:+.3f}"
                )

            # Coarse interpretation bands; independent of the slider value.
            if similarity > 0.95:
                st.success("The code snippets are nearly identical (potential Type-1 clone)")
            elif similarity > 0.85:
                st.success("The code snippets are very similar (potential Type-2 clone)")
            elif similarity > 0.7:
                st.warning("The code snippets show some similarity (potential Type-3 clone)")
            else:
                st.info("The code snippets are significantly different")

            # Show the comment-free, whitespace-collapsed text fed to the model.
            with st.expander("Show normalized code"):
                tab1, tab2 = st.tabs(["First Code", "Second Code"])

                with tab1:
                    st.code(normalize_code(code1))

                with tab2:
                    st.code(normalize_code(code2))
|
|
| |
# Horizontal rule followed by a short explanation of the pipeline.
st.markdown("---")
st.markdown(
    "\n"
    "**How it works**:\n"
    "1. Code is normalized (comments removed, whitespace standardized)\n"
    "2. CodeBERT generates embeddings for each snippet\n"
    "3. Cosine similarity is calculated between embeddings\n"
    "4. Results are compared against your threshold\n"
)