CoCalc -- grpo_length.py

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.

GitHub Repository: huggingface/notebooks
Path: blob/main/course/en/chapter13/grpo_length.py
Views: 2935

1
import marimo
2

3
# marimo version string recorded in this notebook file.
__generated_with = "0.10.6"
# Notebook application object; the cells below attach to it via @app.cell.
app = marimo.App(width="medium")
5

6

7
@app.cell(hide_code=True)
def _():
    # Cell 1: import marimo and show the notebook's heading and instructions.
    import marimo as mo

    # Underscore-prefixed so the name stays private to this cell.
    _intro_markdown = "## Length based reward\nAdjust the slider to see how the reward changes for different completion lengths."

    # marimo displays a cell's last expression as its output.
    mo.md(_intro_markdown)
    return (mo,)
15

16

17
@app.cell(hide_code=True)
def _(mo):
    # Cell 2: slider that selects the ideal completion length
    # (5 to 50 characters, in steps of 5).
    _slider_options = {
        "start": 5,
        "stop": 50,
        "step": 5,
        "label": "Ideal Length (characters)",
    }
    slider = mo.ui.slider(**_slider_options)

    # Bare expression so the widget is rendered as this cell's output.
    slider
    return (slider,)
22

23

24
@app.cell(hide_code=True)
def _(mo, slider):
    # Cell 3: score a toy set of completions against the slider's ideal
    # length and plot the resulting rewards as a bar chart.
    import plotly.express as px

    # Toy dataset with 5 samples of different lengths
    completions = [
        "Short",  # 5 chars
        "Medium length text",  # 18 chars
        "This is about twenty chars",  # 26 chars
        "This is a slightly longer completion",  # 36 chars
        "This is a much longer completion with more words",  # 48 chars
    ]

    def length_reward(completions, ideal_length):
        """
        Calculate rewards based on the length of completions.

        Each completion starts from the longest length in the batch and loses
        one point per character of distance from ``ideal_length``. The score
        is floored at 0, divided by the batch's length range (longest minus
        shortest) and capped at 1.

        Args:
            completions: List of text completions
            ideal_length: Target length in characters

        Returns:
            List of reward scores in [0, 1], one per completion. An empty
            input yields an empty list.
        """
        lengths = [len(completion) for completion in completions]
        if not lengths:
            return []

        # Normalisation bounds come from the batch being scored, so the
        # function does not depend on variables defined outside of it.
        longest = max(lengths)
        length_range = longest - min(lengths)

        rewards = []
        for length in lengths:
            # Longest length minus the distance to the ideal, floored at 0.
            raw_score = max(0, longest - abs(length - ideal_length))
            if length_range == 0:
                # Every completion has the same length, so dividing by the
                # range would raise ZeroDivisionError. Use the limiting value
                # of the capped ratio instead: any positive score saturates.
                rewards.append(1 if raw_score > 0 else 0)
            else:
                rewards.append(min(1, raw_score / length_range))

        return rewards

    # Calculate rewards for the examples
    rewards = length_reward(completions=completions, ideal_length=slider.value)

    # One record per completion for plotting
    results = [
        {"Completion": completion, "Length": len(completion), "Reward": reward}
        for completion, reward in zip(completions, rewards)
    ]

    fig = px.bar(results, x="Completion", y="Reward", color="Length")
    # Last expression of the cell: the interactive chart is the cell's output.
    mo.ui.plotly(fig)
75

76

77
if __name__ == "__main__":
    # Run the notebook app when this file is executed directly as a script.
    app.run()
79

80

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.

Product

Resources

Company

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more, all in one place. Commercial Alternative to JupyterHub.

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.