# DOME wrapper for docstring intent classification
This wrapper allows you to
* split docstrings into sentences
* convert them into the inputs DOME requires
* predict a class for each sentence in the docstring


## Model architecture
The architecture is based on https://github.com/ICSE-DOME/DOME.


## Usage
```python
docstring = "sentences of docstring"
dome = DOME("dome_location")
sentences, predictions = dome.predict(docstring)
```
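
Here `sentences` is the list of sentences extracted from the docstring, and `predictions` contains one intent label per sentence: `what`, `why`, `how-to-use`, `how-it-is-done`, `property`, or `others`.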


## Dependencies
```
spacy
torch
transformers
```
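
The sentence splitter also relies on spaCy's `en_core_web_sm` model, which is downloaded separately. A typical setup (assuming a pip-based environment) might be:
```
pip install spacy torch transformers
python -m spacy download en_core_web_sm
```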


## Code of the model
````python
"""
The model is based on the replication package for the ICSE'23 paper
"Developer-Intent Driven Code Comment Generation".
Initial solution: https://github.com/ICSE-DOME/DOME
The pipeline consists of several parts:
* split the docstring into sentences
* prepare input data for DOMEBertForClassification
* predict a class

How to use:
```python
docstring = "sentences of docstring"
dome = DOME("dome_location")
sentences, predictions = dome.predict(docstring)
```
"""
import re
from typing import Tuple, List

import spacy
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, RobertaConfig, RobertaModel

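# Cap on the full tokenized sequence (including the special tokens added in
# dome_preprocess); presumably chosen to stay within RoBERTa's 512-token limit.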
MAX_LENGTH_BERT = 510


class DOME:
    """
    End-to-end pipeline for docstring classification:
    * split sentences
    * prepare inputs
    * classify
    """
    def __init__(self, pretrained_model: str):
        """
        :param pretrained_model: location of the pretrained model
        """
        self.model = DOMEBertForClassification.from_pretrained(pretrained_model)
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
        self.docstring2sentences = Docstring2Sentences()

    def predict(self, docstring: str) -> Tuple[List[str], List[str]]:
        """
        Predict DOME classes for each sentence in the docstring.
        :param docstring: docstring to process
        :return: tuple with the list of sentences and the list of predictions for each sentence.
        """
        sentences = self.docstring2sentences.docstring2sentences(docstring)
        predictions = [self.model.predict(*dome_preprocess(tokenizer=self.tokenizer, comment=sentence))
                       for sentence in sentences]
        return sentences, predictions


class DOMEBertForClassification(RobertaModel):
    """
    A custom classification model based on RobertaModel for intent classification.

    This model extends RobertaModel with additional linear layers that incorporate
    the comment length as an extra feature for classification.
    """

    DOME_CLASS_NAMES = ["what", "why", "how-to-use", "how-it-is-done", "property", "others"]

    def __init__(self, config: RobertaConfig):
        """
        Initialize the DOMEBertForClassification model.

        :param config: The configuration for the RobertaModel.
        """
        super().__init__(config)

        # The number of classes and layer sizes are fixed (not configurable)
        # so that the pretrained DOME checkpoint can be loaded as-is.
        # DOME layers for intent classification:
        self.fc1 = nn.Linear(768 + 1, 768 // 3)
        self.fc2 = nn.Linear(768 // 3, 6)
        self.dropout = nn.Dropout(0.2)

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor = None,
                comment_len: torch.Tensor = None) -> torch.Tensor:
        """
        Forward pass for the DOMEBertForClassification model.

        :param input_ids: Tensor of token ids to be fed to the model.
        :param attention_mask: Mask to avoid performing attention on padding token indices.
            In this pipeline every value is 1 because inputs are never padded.
        :param comment_len: Binary length feature: 1 if the comment has fewer than 3 words,
            0 otherwise.
        :return: The logits after passing through the model.
        """
        # Use the parent class's forward method to get the base outputs
        outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Extract the pooled output (last hidden state of the [CLS] token)
        pooled_output = outputs.pooler_output
        # DOME custom layers:
        comment_len = comment_len.view(-1, 1).float()  # Ensure comment_len is correctly shaped
        # DOME uses the comment length as an additional input feature
        combined_input = torch.cat([pooled_output, comment_len], dim=-1)
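        # combined_input now has shape [batch, 768 + 1]: the pooled RoBERTa
        # embedding concatenated with the binary comment-length feature.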
        x = self.dropout(F.relu(self.fc1(self.dropout(combined_input))))
        logits = self.fc2(x)
        return logits

    def predict(self, input_ids: torch.Tensor, attention_mask: torch.Tensor = None,
                comment_len: torch.Tensor = None) -> str:
        """
        Predict the class for a tokenized docstring sentence.

        :param input_ids: Tensor of token ids to be fed to the model.
        :param attention_mask: Mask to avoid performing attention on padding token indices.
            In this pipeline every value is 1 because inputs are never padded.
        :param comment_len: Binary length feature: 1 if the comment has fewer than 3 words,
            0 otherwise.
        :return: the predicted class name.
        """
        logits = self.forward(input_ids=input_ids, attention_mask=attention_mask, comment_len=comment_len)
        return self.DOME_CLASS_NAMES[int(torch.argmax(logits, 1))]


def dome_preprocess(tokenizer, comment):
    """
    DOME preprocessor - returns all values required by "DOMEBertForClassification.forward".
    The input is truncated to MAX_LENGTH_BERT tokens so that it fits into BERT.
    :param tokenizer: tokenizer to use.
    :param comment: text of the docstring/comment sentence to be classified by DOMEBertForClassification.
    :return: tuple with (input_ids, attention_mask, comment_len).
    """
    input_ids = tokenizer.convert_tokens_to_ids([tokenizer.cls_token] + tokenizer.tokenize(comment) +
                                                [tokenizer.sep_token])[:MAX_LENGTH_BERT]
    attention_mask = [1] * len(input_ids)
    if len(comment.strip().split()) < 3:
        comment_len = 1
    else:
        comment_len = 0
    return (torch.tensor(input_ids).unsqueeze(0), torch.tensor(attention_mask).unsqueeze(0),
            torch.tensor(comment_len).unsqueeze(0))
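
# Illustrative output shapes for a single comment (the batch dimension is always 1):
#   input_ids      -> [1, n_tokens]  (CLS + comment tokens + SEP, truncated)
#   attention_mask -> [1, n_tokens]  (all ones; inputs are never padded here)
#   comment_len    -> [1]            (binary short-comment flag)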


class Docstring2Sentences:
    """Helper class to split docstrings into sentences"""
    def __init__(self):
        self.spacy_nlp = spacy.load("en_core_web_sm")

    @staticmethod
    def split_docstring(docstring: str, delimiters: List[Tuple[str, str]]):
        """
        Splits the docstring into separate text parts and code blocks, preserving the original formatting.

        :param docstring: The docstring to split.
        :param delimiters: A list of tuples, each containing start and end delimiters for code blocks.
        :return: A list of strings, each either a text block or a code block.
        """

        # Escape delimiter parts for regex and create a combined pattern
        escaped_delimiters = [tuple(map(re.escape, d)) for d in delimiters]
        combined_pattern = '|'.join([f'({start}.*?{end})' for start, end in escaped_delimiters])

        # Split using the combined pattern, preserving the delimiters
        parts = re.split(combined_pattern, docstring, flags=re.DOTALL)

        # Filter out empty strings and the None values that re.split produces for unmatched groups
        parts = [part for part in parts if part]

        return parts
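
        # Illustrative behavior on a hypothetical input:
        #   split_docstring("text @code x = 1 @endcode tail", [("@code", "@endcode")])
        #   -> ["text ", "@code x = 1 @endcode", " tail"]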

    @staticmethod
    def is_only_spaces_and_newlines(string):
        """
        Check if the given string contains only whitespace (spaces, tabs, newlines).

        :param string: The string to check.
        :return: True if the string contains only whitespace, False otherwise.
        """
        return bool(re.match(r'^\s+$', string))

    def docstring2sentences(self, docstring):
        """
        Splits a docstring into individual sentences, preserving code blocks.

        This method uses `split_docstring` to split the docstring into parts based on predefined
        code block delimiters. It then uses a spaCy NLP model to split the non-code text parts
        into sentences. Code blocks are kept intact as single elements.

        :param docstring: The docstring to be processed, which may contain both regular text and code blocks.
        :return: A list containing individual sentences and intact code blocks.
        """
        delimiters = [("@code", "@endcode"), (r"\code", r"\endcode")]
        parts = self.split_docstring(docstring=docstring, delimiters=delimiters)
        sentences = []
        for part in parts:
            if part[1:5] == "code" and part[-7:] == "endcode":
                # keep the code block intact as a single element
                sentences.append(part)
            else:
                sentences.extend(sentence.text for sentence in self.spacy_nlp(part).sents)

        return [sentence for sentence in sentences if not self.is_only_spaces_and_newlines(sentence)]

````
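

## Example
A minimal end-to-end sketch. The docstring text is an arbitrary illustration, and `dome_location` is a placeholder for a directory holding the fine-tuned DOME checkpoint together with its tokenizer files:
```python
dome = DOME("dome_location")  # placeholder path to the pretrained checkpoint
docstring = (
    "Parse the configuration file and return a dict. "
    "Useful for loading settings at startup. "
    "@code cfg = load_config(path) @endcode"
)
sentences, predictions = dome.predict(docstring)
for sentence, label in zip(sentences, predictions):
    print(f"{label:>14} | {sentence.strip()}")
```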