Spaces:
Sleeping
Sleeping
File size: 1,668 Bytes
@startuml reward_system
' Reward-system overview diagram.
' Flow shown by the arrows below: a sampled solution attempt goes to either the
' grounded reward group or the self-play reward group; the last component of
' each group feeds a single combined score, which then flows into the group
' comparison box and finally into the step-answer alignment box.

' ---- Global look and feel ---------------------------------------------------
!theme plain
top to bottom direction
skinparam backgroundColor #FEFEFE
skinparam defaultFontName Arial
skinparam defaultFontSize 14
skinparam ArrowColor #334155
skinparam RectangleBorderColor #64748B
skinparam RectangleFontColor #0F172A
skinparam roundcorner 10
' NOTE(review): orthogonal routing can place arrow labels imprecisely; check
' that the "grounded" / "self-play" labels render next to their arrows.
skinparam linetype ortho
' No package elements are declared in this diagram; setting kept as-is.
skinparam packageStyle rectangle
skinparam nodesep 54
skinparam ranksep 60

title AxiomForgeAI - Reward System

' ---- Entry point -------------------------------------------------------------
rectangle "Sampled Solution Attempt" as ATTEMPT #DBEAFE

' ---- Grounded reward: components chained GOLD -> GPRM -> GCHAIN -> GFORMAT ---
rectangle "Grounded Reward\nknown-answer problem" as GROUNDED #ECFDF5 {
  rectangle "Final answer\nmatches gold" as GOLD #CCFBF1
  rectangle "PRM process score\nreasoning quality" as GPRM #CCFBF1
  rectangle "Chain consistency\ncorrect prefix + final check" as GCHAIN #CCFBF1
  rectangle "Format score\nparseable final answer" as GFORMAT #CCFBF1
}

' ---- Self-play reward: components chained QUALITY -> SOLUTION -> SFORMAT -----
rectangle "Self-Play Reward\ngenerated challenge" as SELFPLAY #EEF2FF {
  rectangle "Question quality\nclarity, novelty, solvability" as QUALITY #E0E7FF
  rectangle "Solution quality\nPRM + chain checks" as SOLUTION #E0E7FF
  rectangle "Format score\nparseable final answer" as SFORMAT #E0E7FF
}

' ---- Downstream stages shared by both reward paths ---------------------------
rectangle "Combined Reward\none score per attempt" as SCORE #F1F5F9
rectangle "GRPO Group Comparison\nrank attempts within the same problem" as COMPARE #EDE9FE
rectangle "Step-Answer Alignment\nreward paths where reasoning supports the result" as ALIGN #DDD6FE

' ---- Routing from the attempt into the two reward groups ---------------------
' Direction hints place the grounded group to the left and self-play to the right.
ATTEMPT -left-> GROUNDED : grounded
ATTEMPT -right-> SELFPLAY : self-play

' ---- Ordering inside the grounded group --------------------------------------
GOLD --> GPRM
GPRM --> GCHAIN
GCHAIN --> GFORMAT

' ---- Ordering inside the self-play group -------------------------------------
QUALITY --> SOLUTION
SOLUTION --> SFORMAT

' ---- Both groups converge on the combined score ------------------------------
GFORMAT -down-> SCORE
SFORMAT -down-> SCORE

' ---- Post-scoring stages, laid out left to right -----------------------------
SCORE -right-> COMPARE
COMPARE -right-> ALIGN
@enduml
|