From 9fa8326d7725e1835249ed6fec9c441ae9c66083 Mon Sep 17 00:00:00 2001
From: Rob Letzler <22990670+rl-utility-man@users.noreply.github.com>
Date: Sat, 19 Apr 2025 23:21:19 -0400
Subject: [PATCH 1/5] add swarm plot to the scatter documentation
This is inspired by #5087
---
doc/python/line-and-scatter.md | 89 ++++++++++++++++++++++++++++++++++
1 file changed, 89 insertions(+)
diff --git a/doc/python/line-and-scatter.md b/doc/python/line-and-scatter.md
index 9ddaad8aac7..ea3ca6ac4a1 100644
--- a/doc/python/line-and-scatter.md
+++ b/doc/python/line-and-scatter.md
@@ -284,6 +284,95 @@ fig.update_traces(textposition="bottom right")
fig.show()
```
+### Swarm (or Beeswarm) Plots
+
+Swarm plots show the distribution of values in a column by giving each entry one dot and adjusting the y-values so that the dots do not overlap and are arranged symmetrically around the y=0 line. They complement histograms, box plots, and violin plots. This example could be generalized to implement a swarm plot for multiple categories by offsetting the y-coordinates of each category.
+
+```python
+import pandas as pd
+import plotly.express as px
+import collections
+
+
+def swarm(
+ X_series,
+ point_size=16,
+ fig_width = 800,
+ gap_multiplier=1.2,
+):
+ #sorting will align columns in attractive arcs rather than having columns the vary unpredicatbly in the x-dimension
+ X_series=X_series.copy().sort_values()
+
+
+ # we need to reason in terms of the marker size that is measured in px
+ # so we need to think about each x-coordinate as being a fraction of the way from the
+ # minimum X value to the maximum X value
+ min_x = min(X_series)
+ max_x = max(X_series)
+
+ list_of_rows = []
+ # we will count the number of points in each "bin" / vertical strip of the graph
+ # to be able to assign a y-coordinate that avoids overlapping
+ bin_counter = collections.Counter()
+
+ for x_val in X_series:
+ # assign this x_value to bin number
+ # each bin is a vertical strip wide enough for one marker
+ bin=(((fig_width*(x_val-min_x))/(max_x-min_x)) // point_size)
+
+ #update the count of dots in that strip
+ bin_counter.update([bin])
+
+ # if this is an odd numbered entry in its bin, make its y coordinate negative
+ # the y coordinate of the first entry is 0, so entries 3, 5, and 7 get negative y coordinates
+ if bin_counter[bin]%2 == 1:
+ negative_1_if_count_is_odd = -1
+ else:
+ negative_1_if_count_is_odd = 1
+
+ # the collision free y coordinate gives the items in a vertical bin
+ # coordinates: 0, 1, -1, 2, -2, 3, -3 ... and so on to evenly spread
+ # their locations above and below the y-axis (we'll make a correction below to deal with even numbers of entries)
+ # we then scale this by the point_size*gap_multiplier to get a y coordinate in px
+
+ collision_free_y_coordinate=(bin_counter[bin]//2)*negative_1_if_count_is_odd*point_size*gap_multiplier
+ list_of_rows.append({"x":x_val,"y":collision_free_y_coordinate,"bin":bin})
+
+ # if the number of points is even,
+ # move y-coordinates down to put an equal number of entries above and below the axis
+ for row in list_of_rows:
+ if bin_counter[row["bin"]]%2==0:
+ row["y"]-=point_size*gap_multiplier/2
+
+ df = pd.DataFrame(list_of_rows)
+
+ fig = px.scatter(
+ df,
+ x="x",
+ y="y",
+ hover_data="x",
+ )
+ #we want to suppress the y coordinate in the hover value because the y-coordinate is irrelevant/misleading
+ fig.update_traces(
+ marker_size=point_size,
+ hovertemplate="value: %{x}",
+ )
+ # we have to set the width and height because we aim to avoid icon collisions and we specify the icon size
+ # in the same units as the width and height
+ fig.update_layout(width=fig_width, height=(point_size*max(bin_counter.values())+200))
+ fig.update_yaxes(
+ showticklabels=False, # Turn off y-axis labels
+ ticks='', # Remove the ticks
+ title=""
+ )
+ fig.show()
+
+
+
+df_iris = px.data.iris() # iris is a pandas DataFrame
+swarm(df_iris["sepal_length"])
+```
+
## Scatter and line plots with go.Scatter
If Plotly Express does not provide a good starting point, it is possible to use [the more generic `go.Scatter` class from `plotly.graph_objects`](/python/graph-objects/). Whereas `plotly.express` has two functions `scatter` and `line`, `go.Scatter` can be used both for plotting points (makers) or lines, depending on the value of `mode`. The different options of `go.Scatter` are documented in its [reference page](https://plotly.com/python/reference/scatter/).
From e563804f06e89aa9a62ecf615e68313ce62954c0 Mon Sep 17 00:00:00 2001
From: Rob Letzler <22990670+rl-utility-man@users.noreply.github.com>
Date: Sat, 19 Apr 2025 23:26:05 -0400
Subject: [PATCH 2/5] ending with fig.show()
---
doc/python/line-and-scatter.md | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/doc/python/line-and-scatter.md b/doc/python/line-and-scatter.md
index ea3ca6ac4a1..6a741934aa1 100644
--- a/doc/python/line-and-scatter.md
+++ b/doc/python/line-and-scatter.md
@@ -365,12 +365,14 @@ def swarm(
ticks='', # Remove the ticks
title=""
)
- fig.show()
+ return fig
df_iris = px.data.iris() # iris is a pandas DataFrame
-swarm(df_iris["sepal_length"])
+fig = swarm(df_iris["sepal_length"])
+fig.show()
+
```
## Scatter and line plots with go.Scatter
From 15b758055fbce230628e74120a0847cccc4f26d1 Mon Sep 17 00:00:00 2001
From: Rob Letzler <22990670+rl-utility-man@users.noreply.github.com>
Date: Mon, 21 Apr 2025 00:25:55 -0400
Subject: [PATCH 3/5] collision avoidance
---
doc/python/line-and-scatter.md | 56 ++++++++++++++++++++++++----------
1 file changed, 40 insertions(+), 16 deletions(-)
diff --git a/doc/python/line-and-scatter.md b/doc/python/line-and-scatter.md
index 6a741934aa1..de160d77452 100644
--- a/doc/python/line-and-scatter.md
+++ b/doc/python/line-and-scatter.md
@@ -293,12 +293,23 @@ import pandas as pd
import plotly.express as px
import collections
+def negative_1_if_count_is_odd(count):
+ # if this is an odd numbered entry in its bin, make its y coordinate negative
+ # the y coordinate of the first entry is 0, so entries 3, 5, and 7 get negative y coordinates
+ if count%2 == 1:
+ return -1
+ else:
+ return 1
+
+
+
def swarm(
X_series,
point_size=16,
fig_width = 800,
gap_multiplier=1.2,
+ center_even_groups = False
):
#sorting will align columns in attractive arcs rather than having columns the vary unpredicatbly in the x-dimension
X_series=X_series.copy().sort_values()
@@ -309,7 +320,7 @@ def swarm(
# minimum X value to the maximum X value
min_x = min(X_series)
max_x = max(X_series)
-
+
list_of_rows = []
# we will count the number of points in each "bin" / vertical strip of the graph
# to be able to assign a y-coordinate that avoids overlapping
@@ -319,33 +330,43 @@ def swarm(
# assign this x_value to bin number
# each bin is a vertical strip wide enough for one marker
bin=(((fig_width*(x_val-min_x))/(max_x-min_x)) // point_size)
-
+
#update the count of dots in that strip
bin_counter.update([bin])
-
- # if this is an odd numbered entry in its bin, make its y coordinate negative
- # the y coordinate of the first entry is 0, so entries 3, 5, and 7 get negative y coordinates
- if bin_counter[bin]%2 == 1:
- negative_1_if_count_is_odd = -1
- else:
- negative_1_if_count_is_odd = 1
+
# the collision free y coordinate gives the items in a vertical bin
# coordinates: 0, 1, -1, 2, -2, 3, -3 ... and so on to evenly spread
# their locations above and below the y-axis (we'll make a correction below to deal with even numbers of entries)
# we then scale this by the point_size*gap_multiplier to get a y coordinate in px
- collision_free_y_coordinate=(bin_counter[bin]//2)*negative_1_if_count_is_odd*point_size*gap_multiplier
- list_of_rows.append({"x":x_val,"y":collision_free_y_coordinate,"bin":bin})
+ collision_free_y_coordinate=(bin_counter[bin]//2)*negative_1_if_count_is_odd(bin_counter[bin])*point_size*gap_multiplier
+ list_of_rows.append({"x":x_val,"y":collision_free_y_coordinate,"bin":bin, "adj":0})
# if the number of points is even,
# move y-coordinates down to put an equal number of entries above and below the axis
+ #this can sometimes break the collision avoidance routine, but makes small N outputs look better otherwise
+ if center_even_groups:
+ for row in list_of_rows:
+ if bin_counter[row["bin"]]%2==0:
+ row["y"]-=point_size*gap_multiplier/2
+ row["adj"]=-point_size*gap_multiplier/2
+
+
for row in list_of_rows:
- if bin_counter[row["bin"]]%2==0:
- row["y"]-=point_size*gap_multiplier/2
+ bin = row["bin"]
+ #see if we need to "look left" to avoid a possible collision
+ for other_row in list_of_rows:
+ if (other_row["bin"]==bin-1 ):
+ if (((other_row["y"]==row["y"]) or (other_row["y"]==row["y"]+row["adj"]))
+ and (((fig_width*(row["x"]-other_row["x"]))/(max_x-min_x) // point_size) < 1)):
+ bin_counter.update([bin])
+ row["y"]=(bin_counter[bin]//2)*negative_1_if_count_is_odd(bin_counter[bin])*point_size*gap_multiplier+row["adj"]
+
+
df = pd.DataFrame(list_of_rows)
-
+
fig = px.scatter(
df,
x="x",
@@ -370,9 +391,12 @@ def swarm(
df_iris = px.data.iris() # iris is a pandas DataFrame
-fig = swarm(df_iris["sepal_length"])
+x = df_iris["sepal_length"]
+x2 = pd.Series([5.05])
+x = pd.concat([x,x2], ignore_index=True)
+fig = swarm(x)
+#fig = swarm(pd.Series([1,1.5, 1.78, 1.79,2,2,12]))
fig.show()
-
```
## Scatter and line plots with go.Scatter
From 1e4d6b947d5787e7f514573f2d6b3018fdef2b28 Mon Sep 17 00:00:00 2001
From: Rob Letzler <22990670+rl-utility-man@users.noreply.github.com>
Date: Fri, 2 May 2025 20:06:12 -0400
Subject: [PATCH 4/5] replaced a work around with a bug fix.
---
doc/python/line-and-scatter.md | 43 +++++++++++++++++++---------------
1 file changed, 24 insertions(+), 19 deletions(-)
diff --git a/doc/python/line-and-scatter.md b/doc/python/line-and-scatter.md
index de160d77452..d14d79df954 100644
--- a/doc/python/line-and-scatter.md
+++ b/doc/python/line-and-scatter.md
@@ -309,8 +309,8 @@ def swarm(
point_size=16,
fig_width = 800,
gap_multiplier=1.2,
- center_even_groups = False
-):
+ bin_fraction=0.95, #bin fraction slightly undersizes the bins to avoid collisions
+ ):
#sorting will align columns in attractive arcs rather than having columns the vary unpredicatbly in the x-dimension
X_series=X_series.copy().sort_values()
@@ -328,8 +328,9 @@ def swarm(
for x_val in X_series:
# assign this x_value to bin number
- # each bin is a vertical strip wide enough for one marker
- bin=(((fig_width*(x_val-min_x))/(max_x-min_x)) // point_size)
+ # each bin is a vertical strip slightly narrower than one marker
+
+ bin=(((fig_width*bin_fraction*(x_val-min_x))/(max_x-min_x)) // point_size)
#update the count of dots in that strip
bin_counter.update([bin])
@@ -341,16 +342,8 @@ def swarm(
# we then scale this by the point_size*gap_multiplier to get a y coordinate in px
collision_free_y_coordinate=(bin_counter[bin]//2)*negative_1_if_count_is_odd(bin_counter[bin])*point_size*gap_multiplier
- list_of_rows.append({"x":x_val,"y":collision_free_y_coordinate,"bin":bin, "adj":0})
+ list_of_rows.append({"x":x_val,"y":collision_free_y_coordinate,"bin":bin})
- # if the number of points is even,
- # move y-coordinates down to put an equal number of entries above and below the axis
- #this can sometimes break the collision avoidance routine, but makes small N outputs look better otherwise
- if center_even_groups:
- for row in list_of_rows:
- if bin_counter[row["bin"]]%2==0:
- row["y"]-=point_size*gap_multiplier/2
- row["adj"]=-point_size*gap_multiplier/2
for row in list_of_rows:
@@ -358,24 +351,39 @@ def swarm(
#see if we need to "look left" to avoid a possible collision
for other_row in list_of_rows:
if (other_row["bin"]==bin-1 ):
- if (((other_row["y"]==row["y"]) or (other_row["y"]==row["y"]+row["adj"]))
+ #"bubble" the entry up until we find a slot that avoids a collision
+ while ((other_row["y"]==row["y"])
and (((fig_width*(row["x"]-other_row["x"]))/(max_x-min_x) // point_size) < 1)):
+ print(row)
+ print(other_row)
+ print(((fig_width*(row["x"]-other_row["x"] ))/(max_x-min_x) // point_size))
+
+ print("updating to fix collision")
bin_counter.update([bin])
- row["y"]=(bin_counter[bin]//2)*negative_1_if_count_is_odd(bin_counter[bin])*point_size*gap_multiplier+row["adj"]
+ print(bin_counter[bin])
+ row["y"]=(bin_counter[bin]//2)*negative_1_if_count_is_odd(bin_counter[bin])*point_size*gap_multiplier
+ print(row["y"])
+ # if the number of points is even,
+ # move y-coordinates down to put an equal number of entries above and below the axis
+ for row in list_of_rows:
+ if bin_counter[row["bin"]]%2==0:
+ row["y"]-=point_size*gap_multiplier/2
df = pd.DataFrame(list_of_rows)
+ # one way to make this code more flexible to e.g. handle multiple categories would be to return a list of "swarmified" y coordinates here
+ # you could then generate "swarmified" y coordinates for each category and add category specific offsets before scatterplotting them
fig = px.scatter(
df,
x="x",
y="y",
- hover_data="x",
)
#we want to suppress the y coordinate in the hover value because the y-coordinate is irrelevant/misleading
fig.update_traces(
marker_size=point_size,
+ #suppress the y coordinate because the y-coordinate is irrelevant
hovertemplate="value: %{x}",
)
# we have to set the width and height because we aim to avoid icon collisions and we specify the icon size
@@ -392,10 +400,7 @@ def swarm(
df_iris = px.data.iris() # iris is a pandas DataFrame
x = df_iris["sepal_length"]
-x2 = pd.Series([5.05])
-x = pd.concat([x,x2], ignore_index=True)
fig = swarm(x)
-#fig = swarm(pd.Series([1,1.5, 1.78, 1.79,2,2,12]))
fig.show()
```
From 5469864eafbaa73eff467ba732e3a949ae589807 Mon Sep 17 00:00:00 2001
From: Rob Letzler <22990670+rl-utility-man@users.noreply.github.com>
Date: Sat, 3 May 2025 23:25:20 -0400
Subject: [PATCH 5/5] maintain collision avoidance while arranging points in
c-curves
---
doc/python/line-and-scatter.md | 140 ++++++++++++++++++---------------
1 file changed, 78 insertions(+), 62 deletions(-)
diff --git a/doc/python/line-and-scatter.md b/doc/python/line-and-scatter.md
index d14d79df954..e262cf60762 100644
--- a/doc/python/line-and-scatter.md
+++ b/doc/python/line-and-scatter.md
@@ -293,30 +293,32 @@ import pandas as pd
import plotly.express as px
import collections
-def negative_1_if_count_is_odd(count):
- # if this is an odd numbered entry in its bin, make its y coordinate negative
- # the y coordinate of the first entry is 0, so entries 3, 5, and 7 get negative y coordinates
- if count%2 == 1:
- return -1
- else:
- return 1
-
+def negative_1_if_count_is_odd(count):
+ # if this is an odd numbered entry in its bin, make its y coordinate negative
+ # the y coordinate of the first entry is 0, so entries 3, 5, and 7 get
+ # negative y coordinates
+ if count % 2 == 1:
+ return -1
+ else:
+ return 1
def swarm(
X_series,
point_size=16,
- fig_width = 800,
+ fig_width=800,
gap_multiplier=1.2,
- bin_fraction=0.95, #bin fraction slightly undersizes the bins to avoid collisions
- ):
- #sorting will align columns in attractive arcs rather than having columns the vary unpredicatbly in the x-dimension
- X_series=X_series.copy().sort_values()
-
+ bin_fraction=0.95, # slightly undersizes the bins to avoid collisions
+):
+ # sorting will align columns in attractive c-shaped arcs rather than having
+ # columns that vary unpredictably in the x-dimension.
+ # We also exploit the fact that sorting means we see bins sequentially when
+ # we add collision prevention offsets.
+ X_series = X_series.copy().sort_values()
# we need to reason in terms of the marker size that is measured in px
- # so we need to think about each x-coordinate as being a fraction of the way from the
+ # so we need to think about each x-coordinate as being a fraction of the way from the
# minimum X value to the maximum X value
min_x = min(X_series)
max_x = max(X_series)
@@ -329,79 +331,93 @@ def swarm(
for x_val in X_series:
# assign this x_value to bin number
# each bin is a vertical strip slightly narrower than one marker
-
- bin=(((fig_width*bin_fraction*(x_val-min_x))/(max_x-min_x)) // point_size)
+ bin = (((fig_width*bin_fraction*(x_val-min_x))/(max_x-min_x)) // point_size)
- #update the count of dots in that strip
+ # update the count of dots in that strip
bin_counter.update([bin])
+ # remember the "y-slot" which tells us the number of points in this bin and is sufficient to compute the y coordinate unless there's a collision with the point to its left
+ list_of_rows.append(
+ {"x": x_val, "y_slot": bin_counter[bin], "bin": bin})
- # the collision free y coordinate gives the items in a vertical bin
- # coordinates: 0, 1, -1, 2, -2, 3, -3 ... and so on to evenly spread
- # their locations above and below the y-axis (we'll make a correction below to deal with even numbers of entries)
- # we then scale this by the point_size*gap_multiplier to get a y coordinate in px
-
- collision_free_y_coordinate=(bin_counter[bin]//2)*negative_1_if_count_is_odd(bin_counter[bin])*point_size*gap_multiplier
- list_of_rows.append({"x":x_val,"y":collision_free_y_coordinate,"bin":bin})
-
-
-
+ # iterate through the points and "offset" any that are colliding with a
+ # point to their left; apply the offsets to all subsequent points in the same bin.
+ # this arranges points in an attractive swarm c-curve where the points
+ # toward the edges are (weakly) further right.
+ bin = 0
+ offset = 0
for row in list_of_rows:
- bin = row["bin"]
- #see if we need to "look left" to avoid a possible collision
+ if bin != row["bin"]:
+ # we have moved to a new bin, so we need to reset the offset
+ bin = row["bin"]
+ offset = 0
+ # see if we need to "look left" to avoid a possible collision
for other_row in list_of_rows:
- if (other_row["bin"]==bin-1 ):
- #"bubble" the entry up until we find a slot that avoids a collision
- while ((other_row["y"]==row["y"])
- and (((fig_width*(row["x"]-other_row["x"]))/(max_x-min_x) // point_size) < 1)):
- print(row)
- print(other_row)
- print(((fig_width*(row["x"]-other_row["x"] ))/(max_x-min_x) // point_size))
-
- print("updating to fix collision")
+ if (other_row["bin"] == bin-1):
+ # "bubble" the entry up until we find a slot that avoids a collision
+ while ((other_row["y_slot"] == row["y_slot"]+offset)
+ and (((fig_width*(row["x"]-other_row["x"]))/(max_x-min_x)
+ // point_size) < 1)):
+ offset += 1
+ # update the bin count so we know whether the number of
+ # *used* slots is even or odd
bin_counter.update([bin])
- print(bin_counter[bin])
- row["y"]=(bin_counter[bin]//2)*negative_1_if_count_is_odd(bin_counter[bin])*point_size*gap_multiplier
- print(row["y"])
- # if the number of points is even,
- # move y-coordinates down to put an equal number of entries above and below the axis
+ row["y_slot"] += offset
+ # The collision free y coordinate gives the items in a vertical bin
+ # y-coordinates to evenly spread their locations above and below the
+ # y-axis (we'll make a correction below to deal with even numbers of
+ # entries). For now, we'll assign 0, 1, -1, 2, -2, 3, -3 ... and so on.
+ # We scale this by the point_size*gap_multiplier to get a y coordinate
+ # in px.
+ row["y"] = (row["y_slot"]//2) * \
+ negative_1_if_count_is_odd(row["y_slot"])*point_size*gap_multiplier
+ print(row["y"])
+
+ # if the number of points is even, move y-coordinates down to put an equal
+ # number of entries above and below the axis
for row in list_of_rows:
- if bin_counter[row["bin"]]%2==0:
- row["y"]-=point_size*gap_multiplier/2
-
+ if bin_counter[row["bin"]] % 2 == 0:
+ row["y"] -= point_size*gap_multiplier/2
df = pd.DataFrame(list_of_rows)
- # one way to make this code more flexible to e.g. handle multiple categories would be to return a list of "swarmified" y coordinates here
- # you could then generate "swarmified" y coordinates for each category and add category specific offsets before scatterplotting them
+ # One way to make this code more flexible to e.g. handle multiple categories
+ # would be to return a list of "swarmified" y coordinates here and then plot
+ # outside the function.
+ # That generalization would let you "swarmify" y coordinates for each
+ # category and add category-specific offsets to put each category in its
+ # own row
fig = px.scatter(
df,
x="x",
y="y",
)
- #we want to suppress the y coordinate in the hover value because the y-coordinate is irrelevant/misleading
+ # we want to suppress the y coordinate in the hover value because the
+ # y-coordinate is irrelevant/misleading
fig.update_traces(
marker_size=point_size,
- #suppress the y coordinate because the y-coordinate is irrelevant
+ # suppress the y coordinate because the y-coordinate is irrelevant
hovertemplate="value: %{x}",
)
- # we have to set the width and height because we aim to avoid icon collisions and we specify the icon size
- # in the same units as the width and height
- fig.update_layout(width=fig_width, height=(point_size*max(bin_counter.values())+200))
+ # we have to set the width and height because we aim to avoid icon collisions
+ # and we specify the icon size in the same units as the width and height
+ fig.update_layout(width=fig_width, height=(
+ point_size*max(bin_counter.values())+200))
fig.update_yaxes(
- showticklabels=False, # Turn off y-axis labels
- ticks='', # Remove the ticks
- title=""
+ showticklabels=False, # Turn off y-axis labels
+ ticks='', # Remove the ticks
+ title=""
)
return fig
-
-df_iris = px.data.iris() # iris is a pandas DataFrame
-x = df_iris["sepal_length"]
-fig = swarm(x)
-fig.show()
+df = px.data.iris() # iris is a pandas DataFrame
+fig = swarm(df["sepal_length"])
+# here's a more interesting test case for collision avoidance:
+#fig = swarm(pd.Series([1, 1.5, 1.78, 1.79, 1.85, 2,
+# 2, 2, 2, 3, 3, 2.05, 2.1, 2.2, 2.5, 12]))
+fig.show()
```
## Scatter and line plots with go.Scatter