Rodrigo Ferreira Rodrigues committed on
Commit
0651b51
·
1 Parent(s): 937d2a1

Updating documentation

Browse files
Files changed (3) hide show
  1. README.md +31 -16
  2. regression_evaluate.py +14 -14
  3. tests.py +1 -1
README.md CHANGED
@@ -14,37 +14,52 @@ pinned: false
14
 
15
  # Metric Card for regression_evaluate
16
 
17
- ***Module Card Instructions:*** *Fill out the following subsections. Feel free to take a look at existing metric cards if you'd like examples.*
18
-
19
  ## Metric Description
20
- *Give a brief overview of this metric, including what task(s) it is usually used for, if any.*
 
21
 
22
  ## How to Use
23
- *Give general statement of how to use the metric*
24
 
25
- *Provide simplest possible example for using the metric*
 
 
 
 
 
 
 
 
 
 
26
 
27
- ### Inputs
28
- *List all input arguments in the format below*
29
- - **input_field** *(type): Definition of input, with explanation if necessary. State any default value(s).*
30
 
31
  ### Output Values
32
 
33
- *Explain what this metric outputs and provide an example of what the metric output looks like. Modules should return a dictionary with one or multiple key-value pairs, e.g. {"bleu" : 6.02}*
34
 
35
- *State the range of possible values that the metric's output can take, as well as what in that range is considered good. For example: "This metric can take on any value between 0 and 100, inclusive. Higher scores are better."*
 
 
 
 
 
 
36
 
37
  #### Values from Popular Papers
38
- *Give examples, preferably with links to leaderboards or publications, to papers that have reported this metric, along with the values they have reported.*
39
 
40
  ### Examples
41
- *Give code examples of the metric being used. Try to include examples that clear up any potential ambiguity left from the metric description above. If possible, provide a range of examples that show both typical and atypical results, as well as examples where a variety of input parameters are passed.*
 
 
 
 
 
 
 
42
 
43
  ## Limitations and Bias
44
- *Note any known limitations or biases that the metric has, with links and references if possible.*
45
 
46
  ## Citation
47
- *Cite the source where this metric was introduced.*
48
 
49
- ## Further References
50
- *Add any useful further references.*
 
14
 
15
  # Metric Card for regression_evaluate
16
 
 
 
17
  ## Metric Description
18
+
19
+ This metric aims to evaluate regression tasks done by LMs. It expects the model to generate a list of numerical values, which is compared to a gold list of numerical values.
20
 
21
  ## How to Use
 
22
 
23
+ This metric takes 2 mandatory arguments: `generations` (a list of strings), `golds` (a list of lists of floats).
24
+
25
+ ```python
26
+ import evaluate
27
+ metric = evaluate.load("rfr2003/regression_evaluate")
28
+ results = metric.compute(generations=['[150, 0]'], golds=[183, 177, 146, 85, 70, 78, 55, 17, 0, -1, -1])
29
+ print(results)
30
+ {'precision': [4.0], 'recall': [344.0], 'macro-mean': [174.0], 'median macro-mean': 174.0}
31
+ ```
32
+
33
+ This metric accepts one optional argument:
34
 
35
+ `d`: function used to compute the distance between a generated value and a gold one. The default value is a function computing the absolute difference between two numbers.
 
 
36
 
37
  ### Output Values
38
 
39
+ This metric outputs a dictionary with the following values:
40
 
41
+ `precision`: Sum of the minimum distances between each predicted value and the set of gold values, computed for each question.
42
+
43
+ `recall`: Sum of the minimum distances between each gold value and the set of generated values, computed for each question.
44
+
45
+ `macro-mean`: Average between precision and recall, computed for each question.
46
+
47
+ `median macro-mean`: Median across macro-mean values.
48
 
49
  #### Values from Popular Papers
 
50
 
51
  ### Examples
52
+
53
+ ```python
54
+ import evaluate
55
+ metric = evaluate.load("rfr2003/regression_evaluate")
56
+ results = metric.compute(generations=['[150, 0]'], golds=[183, 177, 146, 85, 70, 78, 55, 17, 0, -1, -1])
57
+ print(results)
58
+ {'precision': [4.0], 'recall': [344.0], 'macro-mean': [174.0], 'median macro-mean': 174.0}
59
+ ```
60
 
61
  ## Limitations and Bias
 
62
 
63
  ## Citation
 
64
 
65
+ ## Further References
 
regression_evaluate.py CHANGED
@@ -37,22 +37,25 @@ This metric aims to evaluate regression tasks done by LMs.
37
 
38
  # TODO: Add description of the arguments of the module here
39
  _KWARGS_DESCRIPTION = """
40
- Calculates Accuracy and Blue-1 between generations and gold answers in a MCQ context.
41
  Args:
42
  generations: list of predictions to score. Each predictions
43
  should be a string generated by a LM model.
44
  golds: list of reference for each prediction. Each
45
  reference should be a list of floats.
 
46
  Returns:
47
- precision: ,
48
- recall:
 
 
49
  Examples:
50
  Here is an example on how to use the metric:
51
 
52
  >>> metric = evaluate.load("rfr2003/regression_evaluate")
53
  >>> results = metric.compute(generations=['[150, 0]'], golds=[183, 177, 146, 85, 70, 78, 55, 17, 0, -1, -1])
54
  >>> print(results)
55
- {'precision': 4.0, 'recall': 344.0, 'macro-mean': 174.0, 'median macro-mean': 174.0}
56
  """
57
 
58
 
@@ -91,13 +94,13 @@ class regression_evaluate(evaluate.Metric):
91
  dists.append(g_dist)
92
 
93
  dists = np.array(dists)
94
- precision = np.min(dists, axis=0).sum()
95
 
96
- recall = np.min(dists, axis=1).sum()
97
 
98
  return precision, recall
99
 
100
- def _compute(self, generations, golds):
101
  assert len(generations) == len(golds)
102
  assert isinstance(golds, list)
103
 
@@ -110,21 +113,18 @@ class regression_evaluate(evaluate.Metric):
110
 
111
  f_ans = list(set([float(a.replace(',', '')) for a in f_ans])) #get rid of duples values
112
 
113
- precision, recall = self._calculate_pre_rec(f_ans, f_gold, lambda x,y: abs(x-y))
114
 
115
  precisions.append(precision)
116
  recalls.append(recall)
117
  means_pre_rec.append((precision+recall)/2)
118
 
119
 
120
- macro_prec = np.mean(precisions).item()
121
- macro_rec = np.mean(recalls).item()
122
-
123
  metrics = {}
124
  metrics.update({
125
- 'precision': np.mean(precisions).item(),
126
- 'recall': np.mean(recalls).item(),
127
- 'macro-mean': np.mean(means_pre_rec).item(),
128
  'median macro-mean': median(means_pre_rec)
129
  })
130
 
 
37
 
38
  # TODO: Add description of the arguments of the module here
39
  _KWARGS_DESCRIPTION = """
40
+ Calculates precision, recall, and macro-mean between generations and gold answers in a regression context.
41
  Args:
42
  generations: list of predictions to score. Each predictions
43
  should be a string generated by a LM model.
44
  golds: list of reference for each prediction. Each
45
  reference should be a list of floats.
46
+ d: function used to compute the distance between a generated value and a gold one.
47
  Returns:
48
+ precision: Sum of the minimum distances between each predicted value and the set of gold values, computed for each question.
49
+ recall: Sum of the minimum distances between each gold value and the set of generated values, computed for each question.
50
+ macro-mean: Average between precision and recall, computed for each question.
51
+ median macro-mean: Median across macro-mean values.
52
  Examples:
53
  Here is an example on how to use the metric:
54
 
55
  >>> metric = evaluate.load("rfr2003/regression_evaluate")
56
  >>> results = metric.compute(generations=['[150, 0]'], golds=[183, 177, 146, 85, 70, 78, 55, 17, 0, -1, -1])
57
  >>> print(results)
58
+ {'precision': [4.0], 'recall': [344.0], 'macro-mean': [174.0], 'median macro-mean': 174.0}
59
  """
60
 
61
 
 
94
  dists.append(g_dist)
95
 
96
  dists = np.array(dists)
97
+ precision = np.min(dists, axis=0).sum().item()
98
 
99
+ recall = np.min(dists, axis=1).sum().item()
100
 
101
  return precision, recall
102
 
103
+ def _compute(self, generations, golds, d=lambda x,y: abs(x-y)):
104
  assert len(generations) == len(golds)
105
  assert isinstance(golds, list)
106
 
 
113
 
114
  f_ans = list(set([float(a.replace(',', '')) for a in f_ans])) #get rid of duples values
115
 
116
+ precision, recall = self._calculate_pre_rec(f_ans, f_gold, d)
117
 
118
  precisions.append(precision)
119
  recalls.append(recall)
120
  means_pre_rec.append((precision+recall)/2)
121
 
122
 
 
 
 
123
  metrics = {}
124
  metrics.update({
125
+ 'precision': precisions,
126
+ 'recall': recalls,
127
+ 'macro-mean': means_pre_rec,
128
  'median macro-mean': median(means_pre_rec)
129
  })
130
 
tests.py CHANGED
@@ -2,6 +2,6 @@ test_cases = [
2
  {
3
  'generations': ['[150, 0]'],
4
  'golds': [183, 177, 146, 85, 70, 78, 55, 17, 0, -1, -1],
5
- "result": {'precision': 4.0, 'recall': 344.0, 'macro-mean': 174.0, 'median macro-mean': 174.0}
6
  }
7
  ]
 
2
  {
3
  'generations': ['[150, 0]'],
4
  'golds': [183, 177, 146, 85, 70, 78, 55, 17, 0, -1, -1],
5
+ "result": {'precision': [4.0], 'recall': [344.0], 'macro-mean': [174.0], 'median macro-mean': 174.0}
6
  }
7
  ]