# DOME wrapper for docstring intent classification
This wrapper allows you to
* split docstrings into sentences
* convert them into the inputs DOME requires
* predict a class for each sentence in the docstring


## Model architecture
The architecture is based on https://github.com/ICSE-DOME/DOME.


## Usage
```python
docstring = "sentences of docstring"
dome = DOME("dome_location")
sentences, predictions = dome.predict(docstring)
```
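
Here `sentences` is the list of sentences extracted from the docstring, and `predictions` contains one intent label per sentence: `what`, `why`, `how-to-use`, `how-it-is-done`, `property`, or `others`.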


## Dependencies
```
spacy
torch
transformers
```
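
The sentence splitter also relies on spaCy's `en_core_web_sm` model, which is downloaded separately. A typical setup (assuming a pip-based environment) might be:
```
pip install spacy torch transformers
python -m spacy download en_core_web_sm
```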


## Code of the model
````python
"""
The model is based on the replication package for the ICSE'23 paper
"Developer-Intent Driven Code Comment Generation".
Initial solution: https://github.com/ICSE-DOME/DOME
The pipeline consists of several parts:
* split the docstring into sentences
* prepare input data for DOMEBertForClassification
* predict a class

How to use:
```python
docstring = "sentences of docstring"
dome = DOME("dome_location")
sentences, predictions = dome.predict(docstring)
```
"""
import re
from typing import Tuple, List

import spacy
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, RobertaConfig, RobertaModel

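# Cap on the full tokenized sequence (including the special tokens added in
# dome_preprocess); presumably chosen to stay within RoBERTa's 512-token limit.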
MAX_LENGTH_BERT = 510


class DOME:
    """
    End-to-end pipeline for docstring classification:
    * split sentences
    * prepare inputs
    * classify
    """
    def __init__(self, pretrained_model: str):
        """
        :param pretrained_model: location of the pretrained model
        """
        self.model = DOMEBertForClassification.from_pretrained(pretrained_model)
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
        self.docstring2sentences = Docstring2Sentences()

    def predict(self, docstring: str) -> Tuple[List[str], List[str]]:
        """
        Predict DOME classes for each sentence in the docstring.
        :param docstring: docstring to process
        :return: tuple with the list of sentences and the list of predictions for each sentence.
        """
        sentences = self.docstring2sentences.docstring2sentences(docstring)
        predictions = [self.model.predict(*dome_preprocess(tokenizer=self.tokenizer, comment=sentence))
                       for sentence in sentences]
        return sentences, predictions


class DOMEBertForClassification(RobertaModel):
    """
    A custom classification model based on RobertaModel for intent classification.

    This model extends RobertaModel with additional linear layers that incorporate
    the comment length as an extra feature for classification.
    """

    DOME_CLASS_NAMES = ["what", "why", "how-to-use", "how-it-is-done", "property", "others"]

    def __init__(self, config: RobertaConfig):
        """
        Initialize the DOMEBertForClassification model.

        :param config: The configuration for the RobertaModel.
        """
        super().__init__(config)

        # The number of classes and layer sizes are fixed (not configurable)
        # so that the pretrained DOME checkpoint can be loaded as-is.
        # DOME layers for intent classification:
        self.fc1 = nn.Linear(768 + 1, 768 // 3)
        self.fc2 = nn.Linear(768 // 3, 6)
        self.dropout = nn.Dropout(0.2)

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor = None,
                comment_len: torch.Tensor = None) -> torch.Tensor:
        """
        Forward pass for the DOMEBertForClassification model.

        :param input_ids: Tensor of token ids to be fed to the model.
        :param attention_mask: Mask to avoid performing attention on padding token indices.
            In this pipeline every value is 1 because inputs are never padded.
        :param comment_len: Binary length feature: 1 if the comment has fewer than 3 words,
            0 otherwise.
        :return: The logits after passing through the model.
        """
        # Use the parent class's forward method to get the base outputs
        outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Extract the pooled output (last hidden state of the [CLS] token)
        pooled_output = outputs.pooler_output
        # DOME custom layers:
        comment_len = comment_len.view(-1, 1).float()  # Ensure comment_len is correctly shaped
        # DOME uses the comment length as an additional input feature
        combined_input = torch.cat([pooled_output, comment_len], dim=-1)
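        # combined_input now has shape [batch, 768 + 1]: the pooled RoBERTa
        # embedding concatenated with the binary comment-length feature.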
        x = self.dropout(F.relu(self.fc1(self.dropout(combined_input))))
        logits = self.fc2(x)
        return logits

    def predict(self, input_ids: torch.Tensor, attention_mask: torch.Tensor = None,
                comment_len: torch.Tensor = None) -> str:
        """
        Predict the class for a tokenized docstring sentence.

        :param input_ids: Tensor of token ids to be fed to the model.
        :param attention_mask: Mask to avoid performing attention on padding token indices.
            In this pipeline every value is 1 because inputs are never padded.
        :param comment_len: Binary length feature: 1 if the comment has fewer than 3 words,
            0 otherwise.
        :return: the predicted class name.
        """
        logits = self.forward(input_ids=input_ids, attention_mask=attention_mask, comment_len=comment_len)
        return self.DOME_CLASS_NAMES[int(torch.argmax(logits, 1))]


def dome_preprocess(tokenizer, comment):
    """
    DOME preprocessor - returns all values required by "DOMEBertForClassification.forward".
    The input is truncated to MAX_LENGTH_BERT tokens so that it fits into BERT.
    :param tokenizer: tokenizer to use.
    :param comment: text of the docstring/comment sentence to be classified by DOMEBertForClassification.
    :return: tuple with (input_ids, attention_mask, comment_len).
    """
    input_ids = tokenizer.convert_tokens_to_ids([tokenizer.cls_token] + tokenizer.tokenize(comment) +
                                                [tokenizer.sep_token])[:MAX_LENGTH_BERT]
    attention_mask = [1] * len(input_ids)
    if len(comment.strip().split()) < 3:
        comment_len = 1
    else:
        comment_len = 0
    return (torch.tensor(input_ids).unsqueeze(0), torch.tensor(attention_mask).unsqueeze(0),
            torch.tensor(comment_len).unsqueeze(0))
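
# Illustrative output shapes for a single comment (the batch dimension is always 1):
#   input_ids      -> [1, n_tokens]  (CLS + comment tokens + SEP, truncated)
#   attention_mask -> [1, n_tokens]  (all ones; inputs are never padded here)
#   comment_len    -> [1]            (binary short-comment flag)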


class Docstring2Sentences:
    """Helper class to split docstrings into sentences"""
    def __init__(self):
        self.spacy_nlp = spacy.load("en_core_web_sm")

    @staticmethod
    def split_docstring(docstring: str, delimiters: List[Tuple[str, str]]):
        """
        Splits the docstring into separate text parts and code blocks, preserving the original formatting.

        :param docstring: The docstring to split.
        :param delimiters: A list of tuples, each containing start and end delimiters for code blocks.
        :return: A list of strings, each either a text block or a code block.
        """

        # Escape delimiter parts for regex and create a combined pattern
        escaped_delimiters = [tuple(map(re.escape, d)) for d in delimiters]
        combined_pattern = '|'.join([f'({start}.*?{end})' for start, end in escaped_delimiters])

        # Split using the combined pattern, preserving the delimiters
        parts = re.split(combined_pattern, docstring, flags=re.DOTALL)

        # Filter out empty strings and the None values that re.split produces for unmatched groups
        parts = [part for part in parts if part]

        return parts
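
        # Illustrative behavior on a hypothetical input:
        #   split_docstring("text @code x = 1 @endcode tail", [("@code", "@endcode")])
        #   -> ["text ", "@code x = 1 @endcode", " tail"]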

    @staticmethod
    def is_only_spaces_and_newlines(string):
        """
        Check if the given string contains only whitespace (spaces, tabs, newlines).

        :param string: The string to check.
        :return: True if the string contains only whitespace, False otherwise.
        """
        return bool(re.match(r'^\s+$', string))

    def docstring2sentences(self, docstring):
        """
        Splits a docstring into individual sentences, preserving code blocks.

        This method uses `split_docstring` to split the docstring into parts based on predefined
        code block delimiters. It then uses a spaCy NLP model to split the non-code text parts
        into sentences. Code blocks are kept intact as single elements.

        :param docstring: The docstring to be processed, which may contain both regular text and code blocks.
        :return: A list containing individual sentences and intact code blocks.
        """
        delimiters = [("@code", "@endcode"), (r"\code", r"\endcode")]
        parts = self.split_docstring(docstring=docstring, delimiters=delimiters)
        sentences = []
        for part in parts:
            if part[1:5] == "code" and part[-7:] == "endcode":
                # keep the code block intact as a single element
                sentences.append(part)
            else:
                sentences.extend(sentence.text for sentence in self.spacy_nlp(part).sents)

        return [sentence for sentence in sentences if not self.is_only_spaces_and_newlines(sentence)]

````
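

## Example
A minimal end-to-end sketch. The docstring text is an arbitrary illustration, and `dome_location` is a placeholder for a directory holding the fine-tuned DOME checkpoint together with its tokenizer files:
```python
dome = DOME("dome_location")  # placeholder path to the pretrained checkpoint
docstring = (
    "Parse the configuration file and return a dict. "
    "Useful for loading settings at startup. "
    "@code cfg = load_config(path) @endcode"
)
sentences, predictions = dome.predict(docstring)
for sentence, label in zip(sentences, predictions):
    print(f"{label:>14} | {sentence.strip()}")
```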