From 9fa8326d7725e1835249ed6fec9c441ae9c66083 Mon Sep 17 00:00:00 2001 From: Rob Letzler <22990670+rl-utility-man@users.noreply.github.com> Date: Sat, 19 Apr 2025 23:21:19 -0400 Subject: [PATCH 1/5] add swarm plot to the scatter documentation This is inspired by #5087 --- doc/python/line-and-scatter.md | 89 ++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/doc/python/line-and-scatter.md b/doc/python/line-and-scatter.md index 9ddaad8aac7..ea3ca6ac4a1 100644 --- a/doc/python/line-and-scatter.md +++ b/doc/python/line-and-scatter.md @@ -284,6 +284,95 @@ fig.update_traces(textposition="bottom right") fig.show() ``` +### Swarm (or Beeswarm) Plots + +Swarm plots show the distribution of values in a column by giving each entry one dot and adjusting the y-value so that dots do not overlap and appear symmetrically around the y=0 line. They complement histograms, box plots, and violin plots. This example could be generalized to implement a swarm plot for multiple categories by adjusting the y-coordinate for each category. 
+ +```python +import pandas as pd +import plotly.express as px +import collections + + +def swarm( + X_series, + point_size=16, + fig_width = 800, + gap_multiplier=1.2, +): + #sorting will align columns in attractive arcs rather than having columns the vary unpredicatbly in the x-dimension + X_series=X_series.copy().sort_values() + + + # we need to reason in terms of the marker size that is measured in px + # so we need to think about each x-coordinate as being a fraction of the way from the + # minimum X value to the maximum X value + min_x = min(X_series) + max_x = max(X_series) + + list_of_rows = [] + # we will count the number of points in each "bin" / vertical strip of the graph + # to be able to assign a y-coordinate that avoids overlapping + bin_counter = collections.Counter() + + for x_val in X_series: + # assign this x_value to bin number + # each bin is a vertical strip wide enough for one marker + bin=(((fig_width*(x_val-min_x))/(max_x-min_x)) // point_size) + + #update the count of dots in that strip + bin_counter.update([bin]) + + # if this is an odd numbered entry in its bin, make its y coordinate negative + # the y coordinate of the first entry is 0, so entries 3, 5, and 7 get negative y coordinates + if bin_counter[bin]%2 == 1: + negative_1_if_count_is_odd = -1 + else: + negative_1_if_count_is_odd = 1 + + # the collision free y coordinate gives the items in a vertical bin + # coordinates: 0, 1, -1, 2, -2, 3, -3 ... 
and so on to evenly spread + # their locations above and below the y-axis (we'll make a correction below to deal with even numbers of entries) + # we then scale this by the point_size*gap_multiplier to get a y coordinate in px + + collision_free_y_coordinate=(bin_counter[bin]//2)*negative_1_if_count_is_odd*point_size*gap_multiplier + list_of_rows.append({"x":x_val,"y":collision_free_y_coordinate,"bin":bin}) + + # if the number of points is even, + # move y-coordinates down to put an equal number of entries above and below the axis + for row in list_of_rows: + if bin_counter[row["bin"]]%2==0: + row["y"]-=point_size*gap_multiplier/2 + + df = pd.DataFrame(list_of_rows) + + fig = px.scatter( + df, + x="x", + y="y", + hover_data="x", + ) + #we want to suppress the y coordinate in the hover value because the y-coordinate is irrelevant/misleading + fig.update_traces( + marker_size=point_size, + hovertemplate="value: %{x}", + ) + # we have to set the width and height because we aim to avoid icon collisions and we specify the icon size + # in the same units as the width and height + fig.update_layout(width=fig_width, height=(point_size*max(bin_counter.values())+200)) + fig.update_yaxes( + showticklabels=False, # Turn off y-axis labels + ticks='', # Remove the ticks + title="" + ) + fig.show() + + + +df_iris = px.data.iris() # iris is a pandas DataFrame +swarm(df_iris["sepal_length"]) +``` + ## Scatter and line plots with go.Scatter If Plotly Express does not provide a good starting point, it is possible to use [the more generic `go.Scatter` class from `plotly.graph_objects`](/python/graph-objects/). Whereas `plotly.express` has two functions `scatter` and `line`, `go.Scatter` can be used both for plotting points (makers) or lines, depending on the value of `mode`. The different options of `go.Scatter` are documented in its [reference page](https://plotly.com/python/reference/scatter/). 
From e563804f06e89aa9a62ecf615e68313ce62954c0 Mon Sep 17 00:00:00 2001 From: Rob Letzler <22990670+rl-utility-man@users.noreply.github.com> Date: Sat, 19 Apr 2025 23:26:05 -0400 Subject: [PATCH 2/5] ending with fig.show() --- doc/python/line-and-scatter.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/python/line-and-scatter.md b/doc/python/line-and-scatter.md index ea3ca6ac4a1..6a741934aa1 100644 --- a/doc/python/line-and-scatter.md +++ b/doc/python/line-and-scatter.md @@ -365,12 +365,14 @@ def swarm( ticks='', # Remove the ticks title="" ) - fig.show() + return fig df_iris = px.data.iris() # iris is a pandas DataFrame -swarm(df_iris["sepal_length"]) +fig = swarm(df_iris["sepal_length"]) +fig.show() + ``` ## Scatter and line plots with go.Scatter From 15b758055fbce230628e74120a0847cccc4f26d1 Mon Sep 17 00:00:00 2001 From: Rob Letzler <22990670+rl-utility-man@users.noreply.github.com> Date: Mon, 21 Apr 2025 00:25:55 -0400 Subject: [PATCH 3/5] collision avoidance --- doc/python/line-and-scatter.md | 56 ++++++++++++++++++++++++---------- 1 file changed, 40 insertions(+), 16 deletions(-) diff --git a/doc/python/line-and-scatter.md b/doc/python/line-and-scatter.md index 6a741934aa1..de160d77452 100644 --- a/doc/python/line-and-scatter.md +++ b/doc/python/line-and-scatter.md @@ -293,12 +293,23 @@ import pandas as pd import plotly.express as px import collections +def negative_1_if_count_is_odd(count): + # if this is an odd numbered entry in its bin, make its y coordinate negative + # the y coordinate of the first entry is 0, so entries 3, 5, and 7 get negative y coordinates + if count%2 == 1: + return -1 + else: + return 1 + + + def swarm( X_series, point_size=16, fig_width = 800, gap_multiplier=1.2, + center_even_groups = False ): #sorting will align columns in attractive arcs rather than having columns the vary unpredicatbly in the x-dimension X_series=X_series.copy().sort_values() @@ -309,7 +320,7 @@ def swarm( # minimum X value to the 
maximum X value min_x = min(X_series) max_x = max(X_series) - + list_of_rows = [] # we will count the number of points in each "bin" / vertical strip of the graph # to be able to assign a y-coordinate that avoids overlapping @@ -319,33 +330,43 @@ def swarm( # assign this x_value to bin number # each bin is a vertical strip wide enough for one marker bin=(((fig_width*(x_val-min_x))/(max_x-min_x)) // point_size) - + #update the count of dots in that strip bin_counter.update([bin]) - - # if this is an odd numbered entry in its bin, make its y coordinate negative - # the y coordinate of the first entry is 0, so entries 3, 5, and 7 get negative y coordinates - if bin_counter[bin]%2 == 1: - negative_1_if_count_is_odd = -1 - else: - negative_1_if_count_is_odd = 1 + # the collision free y coordinate gives the items in a vertical bin # coordinates: 0, 1, -1, 2, -2, 3, -3 ... and so on to evenly spread # their locations above and below the y-axis (we'll make a correction below to deal with even numbers of entries) # we then scale this by the point_size*gap_multiplier to get a y coordinate in px - collision_free_y_coordinate=(bin_counter[bin]//2)*negative_1_if_count_is_odd*point_size*gap_multiplier - list_of_rows.append({"x":x_val,"y":collision_free_y_coordinate,"bin":bin}) + collision_free_y_coordinate=(bin_counter[bin]//2)*negative_1_if_count_is_odd(bin_counter[bin])*point_size*gap_multiplier + list_of_rows.append({"x":x_val,"y":collision_free_y_coordinate,"bin":bin, "adj":0}) # if the number of points is even, # move y-coordinates down to put an equal number of entries above and below the axis + #this can sometimes break the collision avoidance routine, but makes small N outputs look better otherwise + if center_even_groups: + for row in list_of_rows: + if bin_counter[row["bin"]]%2==0: + row["y"]-=point_size*gap_multiplier/2 + row["adj"]=-point_size*gap_multiplier/2 + + for row in list_of_rows: - if bin_counter[row["bin"]]%2==0: - row["y"]-=point_size*gap_multiplier/2 + 
bin = row["bin"] + #see if we need to "look left" to avoid a possible collision + for other_row in list_of_rows: + if (other_row["bin"]==bin-1 ): + if (((other_row["y"]==row["y"]) or (other_row["y"]==row["y"]+row["adj"])) + and (((fig_width*(row["x"]-other_row["x"]))/(max_x-min_x) // point_size) < 1)): + bin_counter.update([bin]) + row["y"]=(bin_counter[bin]//2)*negative_1_if_count_is_odd(bin_counter[bin])*point_size*gap_multiplier+row["adj"] + + df = pd.DataFrame(list_of_rows) - + fig = px.scatter( df, x="x", @@ -370,9 +391,12 @@ def swarm( df_iris = px.data.iris() # iris is a pandas DataFrame -fig = swarm(df_iris["sepal_length"]) +x = df_iris["sepal_length"] +x2 = pd.Series([5.05]) +x = pd.concat([x,x2], ignore_index=True) +fig = swarm(x) +#fig = swarm(pd.Series([1,1.5, 1.78, 1.79,2,2,12])) fig.show() - ``` ## Scatter and line plots with go.Scatter From 1e4d6b947d5787e7f514573f2d6b3018fdef2b28 Mon Sep 17 00:00:00 2001 From: Rob Letzler <22990670+rl-utility-man@users.noreply.github.com> Date: Fri, 2 May 2025 20:06:12 -0400 Subject: [PATCH 4/5] replaced a work around with a bug fix. 
--- doc/python/line-and-scatter.md | 43 +++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/doc/python/line-and-scatter.md b/doc/python/line-and-scatter.md index de160d77452..d14d79df954 100644 --- a/doc/python/line-and-scatter.md +++ b/doc/python/line-and-scatter.md @@ -309,8 +309,8 @@ def swarm( point_size=16, fig_width = 800, gap_multiplier=1.2, - center_even_groups = False -): + bin_fraction=0.95, #bin fraction slightly undersizes the bins to avoid collisions + ): #sorting will align columns in attractive arcs rather than having columns the vary unpredicatbly in the x-dimension X_series=X_series.copy().sort_values() @@ -328,8 +328,9 @@ def swarm( for x_val in X_series: # assign this x_value to bin number - # each bin is a vertical strip wide enough for one marker - bin=(((fig_width*(x_val-min_x))/(max_x-min_x)) // point_size) + # each bin is a vertical strip slightly narrower than one marker + + bin=(((fig_width*bin_fraction*(x_val-min_x))/(max_x-min_x)) // point_size) #update the count of dots in that strip bin_counter.update([bin]) @@ -341,16 +342,8 @@ def swarm( # we then scale this by the point_size*gap_multiplier to get a y coordinate in px collision_free_y_coordinate=(bin_counter[bin]//2)*negative_1_if_count_is_odd(bin_counter[bin])*point_size*gap_multiplier - list_of_rows.append({"x":x_val,"y":collision_free_y_coordinate,"bin":bin, "adj":0}) + list_of_rows.append({"x":x_val,"y":collision_free_y_coordinate,"bin":bin}) - # if the number of points is even, - # move y-coordinates down to put an equal number of entries above and below the axis - #this can sometimes break the collision avoidance routine, but makes small N outputs look better otherwise - if center_even_groups: - for row in list_of_rows: - if bin_counter[row["bin"]]%2==0: - row["y"]-=point_size*gap_multiplier/2 - row["adj"]=-point_size*gap_multiplier/2 for row in list_of_rows: @@ -358,24 +351,39 @@ def swarm( #see if we need to "look left" to avoid a 
possible collision for other_row in list_of_rows: if (other_row["bin"]==bin-1 ): - if (((other_row["y"]==row["y"]) or (other_row["y"]==row["y"]+row["adj"])) + #"bubble" the entry up until we find a slot that avoids a collision + while ((other_row["y"]==row["y"]) and (((fig_width*(row["x"]-other_row["x"]))/(max_x-min_x) // point_size) < 1)): + print(row) + print(other_row) + print(((fig_width*(row["x"]-other_row["x"] ))/(max_x-min_x) // point_size)) + + print("updating to fix collision") bin_counter.update([bin]) - row["y"]=(bin_counter[bin]//2)*negative_1_if_count_is_odd(bin_counter[bin])*point_size*gap_multiplier+row["adj"] + print(bin_counter[bin]) + row["y"]=(bin_counter[bin]//2)*negative_1_if_count_is_odd(bin_counter[bin])*point_size*gap_multiplier + print(row["y"]) + # if the number of points is even, + # move y-coordinates down to put an equal number of entries above and below the axis + for row in list_of_rows: + if bin_counter[row["bin"]]%2==0: + row["y"]-=point_size*gap_multiplier/2 df = pd.DataFrame(list_of_rows) + # one way to make this code more flexible to e.g. 
handle multiple categories would be to return a list of "swarmified" y coordinates here + # you could then generate "swarmified" y coordinates for each category and add category specific offsets before scatterplotting them fig = px.scatter( df, x="x", y="y", - hover_data="x", ) #we want to suppress the y coordinate in the hover value because the y-coordinate is irrelevant/misleading fig.update_traces( marker_size=point_size, + #suppress the y coordinate because the y-coordinate is irrelevant hovertemplate="value: %{x}", ) # we have to set the width and height because we aim to avoid icon collisions and we specify the icon size @@ -392,10 +400,7 @@ def swarm( df_iris = px.data.iris() # iris is a pandas DataFrame x = df_iris["sepal_length"] -x2 = pd.Series([5.05]) -x = pd.concat([x,x2], ignore_index=True) fig = swarm(x) -#fig = swarm(pd.Series([1,1.5, 1.78, 1.79,2,2,12])) fig.show() ``` From 5469864eafbaa73eff467ba732e3a949ae589807 Mon Sep 17 00:00:00 2001 From: Rob Letzler <22990670+rl-utility-man@users.noreply.github.com> Date: Sat, 3 May 2025 23:25:20 -0400 Subject: [PATCH 5/5] maintain collision avoidance while arranging points in c-curves --- doc/python/line-and-scatter.md | 140 ++++++++++++++++++--------------- 1 file changed, 78 insertions(+), 62 deletions(-) diff --git a/doc/python/line-and-scatter.md b/doc/python/line-and-scatter.md index d14d79df954..e262cf60762 100644 --- a/doc/python/line-and-scatter.md +++ b/doc/python/line-and-scatter.md @@ -293,30 +293,32 @@ import pandas as pd import plotly.express as px import collections -def negative_1_if_count_is_odd(count): - # if this is an odd numbered entry in its bin, make its y coordinate negative - # the y coordinate of the first entry is 0, so entries 3, 5, and 7 get negative y coordinates - if count%2 == 1: - return -1 - else: - return 1 - +def negative_1_if_count_is_odd(count): + # if this is an odd numbered entry in its bin, make its y coordinate negative + # the y coordinate of the first entry is 0, so 
entries 3, 5, and 7 get + # negative y coordinates + if count % 2 == 1: + return -1 + else: + return 1 def swarm( X_series, point_size=16, - fig_width = 800, + fig_width=800, gap_multiplier=1.2, - bin_fraction=0.95, #bin fraction slightly undersizes the bins to avoid collisions - ): - #sorting will align columns in attractive arcs rather than having columns the vary unpredicatbly in the x-dimension - X_series=X_series.copy().sort_values() - + bin_fraction=0.95, # slightly undersizes the bins to avoid collisions +): + # sorting will align columns in attractive c-shaped arcs rather than having + # columns that vary unpredictably in the x-dimension. + # We also exploit the fact that sorting means we see bins sequentially when + # we add collision prevention offsets. + X_series = X_series.copy().sort_values() # we need to reason in terms of the marker size that is measured in px - # so we need to think about each x-coordinate as being a fraction of the way from the + # so we need to think about each x-coordinate as being a fraction of the way from the # minimum X value to the maximum X value min_x = min(X_series) max_x = max(X_series) @@ -329,79 +331,93 @@ def swarm( for x_val in X_series: # assign this x_value to bin number # each bin is a vertical strip slightly narrower than one marker - - bin=(((fig_width*bin_fraction*(x_val-min_x))/(max_x-min_x)) // point_size) + bin = (((fig_width*bin_fraction*(x_val-min_x))/(max_x-min_x)) // point_size) - #update the count of dots in that strip + # update the count of dots in that strip bin_counter.update([bin]) + # remember the "y-slot" which tells us the number of points in this bin and is sufficient to compute the y coordinate unless there's a collision with the point to its left + list_of_rows.append( + {"x": x_val, "y_slot": bin_counter[bin], "bin": bin}) - # the collision free y coordinate gives the items in a vertical bin - # coordinates: 0, 1, -1, 2, -2, 3, -3 ... 
and so on to evenly spread - # their locations above and below the y-axis (we'll make a correction below to deal with even numbers of entries) - # we then scale this by the point_size*gap_multiplier to get a y coordinate in px - - collision_free_y_coordinate=(bin_counter[bin]//2)*negative_1_if_count_is_odd(bin_counter[bin])*point_size*gap_multiplier - list_of_rows.append({"x":x_val,"y":collision_free_y_coordinate,"bin":bin}) - - - + # iterate through the points and "offset" any that are colliding with a + # point to their left, and apply the offsets to all subsequent points in the same bin. + # this arranges points in an attractive swarm c-curve where the points + # toward the edges are (weakly) further right. + bin = 0 + offset = 0 for row in list_of_rows: - bin = row["bin"] - #see if we need to "look left" to avoid a possible collision + if bin != row["bin"]: + # we have moved to a new bin, so we need to reset the offset + bin = row["bin"] + offset = 0 + # see if we need to "look left" to avoid a possible collision for other_row in list_of_rows: - if (other_row["bin"]==bin-1 ): - #"bubble" the entry up until we find a slot that avoids a collision - while ((other_row["y"]==row["y"]) - and (((fig_width*(row["x"]-other_row["x"]))/(max_x-min_x) // point_size) < 1)): - print(row) - print(other_row) - print(((fig_width*(row["x"]-other_row["x"] ))/(max_x-min_x) // point_size)) - - print("updating to fix collision") + if (other_row["bin"] == bin-1): + # "bubble" the entry up until we find a slot that avoids a collision + while ((other_row["y_slot"] == row["y_slot"]+offset) + and (((fig_width*(row["x"]-other_row["x"]))/(max_x-min_x) + // point_size) < 1)): + offset += 1 + # update the bin count so we know whether the number of + # *used* slots is even or odd bin_counter.update([bin]) - print(bin_counter[bin]) - row["y"]=(bin_counter[bin]//2)*negative_1_if_count_is_odd(bin_counter[bin])*point_size*gap_multiplier - print(row["y"]) - # if the number of points is even, - # move 
y-coordinates down to put an equal number of entries above and below the axis + row["y_slot"] += offset + # The collision free y coordinate gives the items in a vertical bin + # y-coordinates to evenly spread their locations above and below the + # y-axis (we'll make a correction below to deal with even numbers of + # entries). For now, we'll assign 0, 1, -1, 2, -2, 3, -3 ... and so on. + # We scale this by the point_size*gap_multiplier to get a y coordinate + # in px. + row["y"] = (row["y_slot"]//2) * \ + negative_1_if_count_is_odd(row["y_slot"])*point_size*gap_multiplier + print(row["y"]) + + # if the number of points is even, move y-coordinates down to put an equal + # number of entries above and below the axis for row in list_of_rows: - if bin_counter[row["bin"]]%2==0: - row["y"]-=point_size*gap_multiplier/2 - + if bin_counter[row["bin"]] % 2 == 0: + row["y"] -= point_size*gap_multiplier/2 df = pd.DataFrame(list_of_rows) - # one way to make this code more flexible to e.g. handle multiple categories would be to return a list of "swarmified" y coordinates here - # you could then generate "swarmified" y coordinates for each category and add category specific offsets before scatterplotting them + # One way to make this code more flexible to e.g. handle multiple categories + # would be to return a list of "swarmified" y coordinates here and then plot + # outside the function. 
+ # That generalization would let you "swarmify" y coordinates for each + # category and add category specific offsets to put each category in its + # own row fig = px.scatter( df, x="x", y="y", ) - #we want to suppress the y coordinate in the hover value because the y-coordinate is irrelevant/misleading + # we want to suppress the y coordinate in the hover value because the + # y-coordinate is irrelevant/misleading fig.update_traces( marker_size=point_size, - #suppress the y coordinate because the y-coordinate is irrelevant + # suppress the y coordinate because the y-coordinate is irrelevant hovertemplate="value: %{x}", ) - # we have to set the width and height because we aim to avoid icon collisions and we specify the icon size - # in the same units as the width and height - fig.update_layout(width=fig_width, height=(point_size*max(bin_counter.values())+200)) + # we have to set the width and height because we aim to avoid icon collisions + # and we specify the icon size in the same units as the width and height + fig.update_layout(width=fig_width, height=( + point_size*max(bin_counter.values())+200)) fig.update_yaxes( - showticklabels=False, # Turn off y-axis labels - ticks='', # Remove the ticks - title="" + showticklabels=False, # Turn off y-axis labels + ticks='', # Remove the ticks + title="" ) return fig - -df_iris = px.data.iris() # iris is a pandas DataFrame -x = df_iris["sepal_length"] -fig = swarm(x) -fig.show() +df = px.data.iris() # iris is a pandas DataFrame +fig = swarm(df["sepal_length"]) +# here's a more interesting test case for collision avoidance: +#fig = swarm(pd.Series([1, 1.5, 1.78, 1.79, 1.85, 2, +# 2, 2, 2, 3, 3, 2.05, 2.1, 2.2, 2.5, 12])) +fig.show() ``` ## Scatter and line plots with go.Scatter