Spaces:
Sleeping
Sleeping
Rodrigo Ferreira Rodrigues committed on
Commit ·
0651b51
1
Parent(s): 937d2a1
Updating documentation
Browse files- README.md +31 -16
- regression_evaluate.py +14 -14
- tests.py +1 -1
README.md
CHANGED
|
@@ -14,37 +14,52 @@ pinned: false
|
|
| 14 |
|
| 15 |
# Metric Card for regression_evaluate
|
| 16 |
|
| 17 |
-
***Module Card Instructions:*** *Fill out the following subsections. Feel free to take a look at existing metric cards if you'd like examples.*
|
| 18 |
-
|
| 19 |
## Metric Description
|
| 20 |
-
|
|
|
|
| 21 |
|
| 22 |
## How to Use
|
| 23 |
-
*Give general statement of how to use the metric*
|
| 24 |
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
-
|
| 28 |
-
*List all input arguments in the format below*
|
| 29 |
-
- **input_field** *(type): Definition of input, with explanation if necessary. State any default value(s).*
|
| 30 |
|
| 31 |
### Output Values
|
| 32 |
|
| 33 |
-
|
| 34 |
|
| 35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
#### Values from Popular Papers
|
| 38 |
-
*Give examples, preferably with links to leaderboards or publications, to papers that have reported this metric, along with the values they have reported.*
|
| 39 |
|
| 40 |
### Examples
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
## Limitations and Bias
|
| 44 |
-
*Note any known limitations or biases that the metric has, with links and references if possible.*
|
| 45 |
|
| 46 |
## Citation
|
| 47 |
-
*Cite the source where this metric was introduced.*
|
| 48 |
|
| 49 |
-
## Further References
|
| 50 |
-
*Add any useful further references.*
|
|
|
|
| 14 |
|
| 15 |
# Metric Card for regression_evaluate
|
| 16 |
|
|
|
|
|
|
|
| 17 |
## Metric Description
|
| 18 |
+
|
| 19 |
+
This metric aims to evaluate regression tasks done by LMs. It expects the model to generate a list of numerical values, which is compared to a gold list of numerical values.
|
| 20 |
|
| 21 |
## How to Use
|
|
|
|
| 22 |
|
| 23 |
+
This metric takes 2 mandatory arguments: `generations` (a list of strings), `golds` (a list of lists of floats).
|
| 24 |
+
|
| 25 |
+
```python
|
| 26 |
+
import evaluate
|
| 27 |
+
metric = evaluate.load("rfr2003/regression_evaluate")
|
| 28 |
+
results = metric.compute(generations=['[150, 0]'], golds=[183, 177, 146, 85, 70, 78, 55, 17, 0, -1, -1])
|
| 29 |
+
print(results)
|
| 30 |
+
{'precision': [4.0], 'recall': [344.0], 'macro-mean': [174.0], 'median macro-mean': 174.0}
|
| 31 |
+
```
|
| 32 |
+
|
| 33 |
+
This metric accepts one optional argument:
|
| 34 |
|
| 35 |
+
`d`: function used to compute the distance between a generated value and a gold one. The default value is a function computing the absolute difference between two numbers.
|
|
|
|
|
|
|
| 36 |
|
| 37 |
### Output Values
|
| 38 |
|
| 39 |
+
This metric outputs a dictionary with the following values:
|
| 40 |
|
| 41 |
+
`precision`: Sum of the minimum distances between each predicted value and the set of gold values, computed for each question.
|
| 42 |
+
|
| 43 |
+
`recall`: Sum of the minimum distances between each gold value and the set of generated values, computed for each question.
|
| 44 |
+
|
| 45 |
+
`macro-mean`: Average between precision and recall, computed for each question.
|
| 46 |
+
|
| 47 |
+
`median macro-mean`: Median across macro-mean values.
|
| 48 |
|
| 49 |
#### Values from Popular Papers
|
|
|
|
| 50 |
|
| 51 |
### Examples
|
| 52 |
+
|
| 53 |
+
```python
|
| 54 |
+
import evaluate
|
| 55 |
+
metric = evaluate.load("rfr2003/regression_evaluate")
|
| 56 |
+
results = metric.compute(generations=['[150, 0]'], golds=[183, 177, 146, 85, 70, 78, 55, 17, 0, -1, -1])
|
| 57 |
+
print(results)
|
| 58 |
+
{'precision': [4.0], 'recall': [344.0], 'macro-mean': [174.0], 'median macro-mean': 174.0}
|
| 59 |
+
```
|
| 60 |
|
| 61 |
## Limitations and Bias
|
|
|
|
| 62 |
|
| 63 |
## Citation
|
|
|
|
| 64 |
|
| 65 |
+
## Further References
|
|
|
regression_evaluate.py
CHANGED
|
@@ -37,22 +37,25 @@ This metric aims to evaluate regression tasks done by LMs.
|
|
| 37 |
|
| 38 |
# TODO: Add description of the arguments of the module here
|
| 39 |
_KWARGS_DESCRIPTION = """
|
| 40 |
-
Calculates
|
| 41 |
Args:
|
| 42 |
generations: list of predictions to score. Each prediction
|
| 43 |
should be a string generated by an LM.
|
| 44 |
golds: list of references for each prediction. Each
|
| 45 |
reference should be a list of floats.
|
|
|
|
| 46 |
Returns:
|
| 47 |
-
precision:
|
| 48 |
-
recall:
|
|
|
|
|
|
|
| 49 |
Examples:
|
| 50 |
Here is an example of how to use the metric:
|
| 51 |
|
| 52 |
>>> metric = evaluate.load("rfr2003/regression_evaluate")
|
| 53 |
>>> results = metric.compute(generations=['[150, 0]'], golds=[183, 177, 146, 85, 70, 78, 55, 17, 0, -1, -1])
|
| 54 |
>>> print(results)
|
| 55 |
-
{'precision': 4.0, 'recall': 344.0, 'macro-mean': 174.0, 'median macro-mean': 174.0}
|
| 56 |
"""
|
| 57 |
|
| 58 |
|
|
@@ -91,13 +94,13 @@ class regression_evaluate(evaluate.Metric):
|
|
| 91 |
dists.append(g_dist)
|
| 92 |
|
| 93 |
dists = np.array(dists)
|
| 94 |
-
precision = np.min(dists, axis=0).sum()
|
| 95 |
|
| 96 |
-
recall = np.min(dists, axis=1).sum()
|
| 97 |
|
| 98 |
return precision, recall
|
| 99 |
|
| 100 |
-
def _compute(self, generations, golds):
|
| 101 |
assert len(generations) == len(golds)
|
| 102 |
assert isinstance(golds, list)
|
| 103 |
|
|
@@ -110,21 +113,18 @@ class regression_evaluate(evaluate.Metric):
|
|
| 110 |
|
| 111 |
f_ans = list(set([float(a.replace(',', '')) for a in f_ans])) # get rid of duplicate values
|
| 112 |
|
| 113 |
-
precision, recall = self._calculate_pre_rec(f_ans, f_gold,
|
| 114 |
|
| 115 |
precisions.append(precision)
|
| 116 |
recalls.append(recall)
|
| 117 |
means_pre_rec.append((precision+recall)/2)
|
| 118 |
|
| 119 |
|
| 120 |
-
macro_prec = np.mean(precisions).item()
|
| 121 |
-
macro_rec = np.mean(recalls).item()
|
| 122 |
-
|
| 123 |
metrics = {}
|
| 124 |
metrics.update({
|
| 125 |
-
'precision':
|
| 126 |
-
'recall':
|
| 127 |
-
'macro-mean':
|
| 128 |
'median macro-mean': median(means_pre_rec)
|
| 129 |
})
|
| 130 |
|
|
|
|
| 37 |
|
| 38 |
# TODO: Add description of the arguments of the module here
|
| 39 |
_KWARGS_DESCRIPTION = """
|
| 40 |
+
Calculates precision, recall and macro-mean between generations and gold answers in a regression context.
|
| 41 |
Args:
|
| 42 |
generations: list of predictions to score. Each prediction
|
| 43 |
should be a string generated by an LM.
|
| 44 |
golds: list of references for each prediction. Each
|
| 45 |
reference should be a list of floats.
|
| 46 |
+
d: function used to compute the distance between a generated value and a gold one.
|
| 47 |
Returns:
|
| 48 |
+
precision: Sum of the minimum distances between each predicted value and the set of gold values, computed for each question.
|
| 49 |
+
recall: Sum of the minimum distances between each gold value and the set of generated values, computed for each question.
|
| 50 |
+
macro-mean: Average between precision and recall, computed for each question.
|
| 51 |
+
median macro-mean: Median across macro-mean values.
|
| 52 |
Examples:
|
| 53 |
Here is an example of how to use the metric:
|
| 54 |
|
| 55 |
>>> metric = evaluate.load("rfr2003/regression_evaluate")
|
| 56 |
>>> results = metric.compute(generations=['[150, 0]'], golds=[183, 177, 146, 85, 70, 78, 55, 17, 0, -1, -1])
|
| 57 |
>>> print(results)
|
| 58 |
+
{'precision': [4.0], 'recall': [344.0], 'macro-mean': [174.0], 'median macro-mean': 174.0}
|
| 59 |
"""
|
| 60 |
|
| 61 |
|
|
|
|
| 94 |
dists.append(g_dist)
|
| 95 |
|
| 96 |
dists = np.array(dists)
|
| 97 |
+
precision = np.min(dists, axis=0).sum().item()
|
| 98 |
|
| 99 |
+
recall = np.min(dists, axis=1).sum().item()
|
| 100 |
|
| 101 |
return precision, recall
|
| 102 |
|
| 103 |
+
def _compute(self, generations, golds, d=lambda x,y: abs(x-y)):
|
| 104 |
assert len(generations) == len(golds)
|
| 105 |
assert isinstance(golds, list)
|
| 106 |
|
|
|
|
| 113 |
|
| 114 |
f_ans = list(set([float(a.replace(',', '')) for a in f_ans])) # get rid of duplicate values
|
| 115 |
|
| 116 |
+
precision, recall = self._calculate_pre_rec(f_ans, f_gold, d)
|
| 117 |
|
| 118 |
precisions.append(precision)
|
| 119 |
recalls.append(recall)
|
| 120 |
means_pre_rec.append((precision+recall)/2)
|
| 121 |
|
| 122 |
|
|
|
|
|
|
|
|
|
|
| 123 |
metrics = {}
|
| 124 |
metrics.update({
|
| 125 |
+
'precision': precisions,
|
| 126 |
+
'recall': recalls,
|
| 127 |
+
'macro-mean': means_pre_rec,
|
| 128 |
'median macro-mean': median(means_pre_rec)
|
| 129 |
})
|
| 130 |
|
tests.py
CHANGED
|
@@ -2,6 +2,6 @@ test_cases = [
|
|
| 2 |
{
|
| 3 |
'generations': ['[150, 0]'],
|
| 4 |
'golds': [183, 177, 146, 85, 70, 78, 55, 17, 0, -1, -1],
|
| 5 |
-
"result": {'precision': 4.0, 'recall': 344.0, 'macro-mean': 174.0, 'median macro-mean': 174.0}
|
| 6 |
}
|
| 7 |
]
|
|
|
|
| 2 |
{
|
| 3 |
'generations': ['[150, 0]'],
|
| 4 |
'golds': [183, 177, 146, 85, 70, 78, 55, 17, 0, -1, -1],
|
| 5 |
+
"result": {'precision': [4.0], 'recall': [344.0], 'macro-mean': [174.0], 'median macro-mean': 174.0}
|
| 6 |
}
|
| 7 |
]
|