From aebbb06116184ead8e5f98a4c7588d75e5e3a54f Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Sat, 25 Mar 2023 17:18:57 -0700 Subject: [PATCH 01/20] wip --- docs/source/extending_text_generation.rst | 6 +- docs/source/generating_cdc_data.rst | 53 ++++++------- docs/source/generating_json_data.rst | 91 +++++++++++----------- docs/source/multi_table_data.rst | 91 +++++++++++----------- docs/source/troubleshooting.rst | 93 ++++++++++++----------- 5 files changed, 174 insertions(+), 160 deletions(-) diff --git a/docs/source/extending_text_generation.rst b/docs/source/extending_text_generation.rst index 7f0df8a4..f59e7171 100644 --- a/docs/source/extending_text_generation.rst +++ b/docs/source/extending_text_generation.rst @@ -38,7 +38,7 @@ extended syntax. .withColumn("address", text=fakerText("address" )) .withColumn("email", text=fakerText("ascii_company_email") ) .withColumn("ip_address", text=fakerText("ipv4_private" )) - .withColumn("faker_text", text=fakerText("sentence", ext_word_list=my_word_list) ) + .withColumn("faker_text", text=fakerText("sentence", ext_word_list=my_word_list)) ) dfFakerOnly = fakerDataspec.build() @@ -91,7 +91,9 @@ The following code shows use of a custom Python function to generate text: pluginDataspec = (DataGenerator(spark, rows=data_rows, partitions=partitions_requested, randomSeedMethod="hash_fieldname") - .withColumn("text", text=PyfuncText(text_generator, initFn=initPluginContext)) + .withColumn("text", + text=PyfuncText(text_generator, + initFn=initPluginContext)) ) dfPlugin = pluginDataspec.build() diff --git a/docs/source/generating_cdc_data.rst b/docs/source/generating_cdc_data.rst index 1633ce41..ccbf16b4 100644 --- a/docs/source/generating_cdc_data.rst +++ b/docs/source/generating_cdc_data.rst @@ -1,7 +1,7 @@ .. Test Data Generator documentation master file, created by - sphinx-quickstart on Sun Jun 21 10:54:30 2020. 
- You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. +sphinx-quickstart on Sun Jun 21 10:54:30 2020. +You can adapt this file completely to your liking, but it should at least +contain the root `toctree` directive. Generating Change Data Capture Data =================================== @@ -47,28 +47,30 @@ We'll add a timestamp for when the row was generated and a memo field to mark wh uniqueCustomers = 10 * 1000000 - dataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested) - .withColumn("customer_id","long", uniqueValues=uniqueCustomers) - .withColumn("name", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w') - .withColumn("alias", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w') - .withColumn("payment_instrument_type", values=['paypal', 'Visa', 'Mastercard', - 'American Express', 'discover', 'branded visa', 'branded mastercard'], - random=True, distribution="normal") - .withColumn("int_payment_instrument", "int", minValue=0000, maxValue=9999, baseColumn="customer_id", - baseColumnType="hash", omit=True) - .withColumn("payment_instrument", expr="format_number(int_payment_instrument, '**** ****** *####')", - baseColumn="int_payment_instrument") - .withColumn("email", template=r'\\w.\\w@\\w.com|\\w-\\w@\\w') - .withColumn("email2", template=r'\\w.\\w@\\w.com') - .withColumn("ip_address", template=r'\\n.\\n.\\n.\\n') - .withColumn("md5_payment_instrument", - expr="md5(concat(payment_instrument_type, ':', payment_instrument))", - base_column=['payment_instrument_type', 'payment_instrument']) - .withColumn("customer_notes", text=dg.ILText(words=(1,8))) - .withColumn("created_ts", "timestamp", expr="now()") - .withColumn("modified_ts", "timestamp", expr="now()") - .withColumn("memo", expr="'original data'") - ) + dataspec = ( + dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested) + .withColumn("customer_id","long", uniqueValues=uniqueCustomers) + 
.withColumn("name", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w') + .withColumn("alias", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w') + .withColumn("payment_instrument_type", values=['paypal', 'Visa', 'Mastercard', + 'American Express', 'discover', 'branded visa', 'branded mastercard'], + random=True, distribution="normal") + .withColumn("int_payment_instrument", "int", minValue=0000, maxValue=9999, + baseColumn="customer_id", baseColumnType="hash", omit=True) + .withColumn("payment_instrument", + expr="format_number(int_payment_instrument, '**** ****** *####')", + baseColumn="int_payment_instrument") + .withColumn("email", template=r'\\w.\\w@\\w.com|\\w-\\w@\\w') + .withColumn("email2", template=r'\\w.\\w@\\w.com') + .withColumn("ip_address", template=r'\\n.\\n.\\n.\\n') + .withColumn("md5_payment_instrument", + expr="md5(concat(payment_instrument_type, ':', payment_instrument))", + base_column=['payment_instrument_type', 'payment_instrument']) + .withColumn("customer_notes", text=dg.ILText(words=(1,8))) + .withColumn("created_ts", "timestamp", expr="now()") + .withColumn("modified_ts", "timestamp", expr="now()") + .withColumn("memo", expr="'original data'") + ) df1 = dataspec.build() # write table @@ -168,7 +170,6 @@ values of the columns from the source table will be used. ]) print(sqlStmt) - spark.sql(sqlStmt) That's all that's required to perform merges with the data generation framework. diff --git a/docs/source/generating_json_data.rst b/docs/source/generating_json_data.rst index 56989d76..a26db3ee 100644 --- a/docs/source/generating_json_data.rst +++ b/docs/source/generating_json_data.rst @@ -195,51 +195,52 @@ functions such as `named_struct` and `to_json`. 
lines = ['delta', 'xyzzy', 'lakehouse', 'gadget', 'droid'] - testDataSpec = (dg.DataGenerator(spark, name="device_data_set", rows=1000000, - partitions=8, - randomSeedMethod='hash_fieldname') - .withIdOutput() - # we'll use hash of the base field to generate the ids to - # avoid a simple incrementing sequence - .withColumn("internal_device_id", LongType(), minValue=0x1000000000000, - uniqueValues=device_population, omit=True, baseColumnType="hash") - - # note for format strings, we must use "%lx" not "%x" as the - # underlying value is a long - .withColumn("device_id", StringType(), format="0x%013x", - baseColumn="internal_device_id") - - # the device / user attributes will be the same for the same device id - # so lets use the internal device id as the base column for these attribute - .withColumn("country", StringType(), values=country_codes, - weights=country_weights, - baseColumn="internal_device_id") - - .withColumn("manufacturer", StringType(), values=manufacturers, - baseColumn="internal_device_id", omit=True) - .withColumn("line", StringType(), values=lines, baseColumn="manufacturer", - baseColumnType="hash", omit=True) - .withColumn("manufacturer_info", "string", - expr="to_json(named_struct('line', line, 'manufacturer', manufacturer))", - baseColumn=['manufacturer', 'line']) - - - .withColumn("model_ser", IntegerType(), minValue=1, maxValue=11, - baseColumn="device_id", - baseColumnType="hash", omit=True) - - .withColumn("event_type", StringType(), - values=["activation", "deactivation", "plan change", - "telecoms activity", "internet activity", "device error"], - random=True, omit=True) - .withColumn("event_ts", "timestamp", - begin="2020-01-01 01:00:00", end="2020-12-31 23:59:00", - interval="1 minute", random=True, omit=True) - - .withColumn("event_info", "string", - expr="to_json(named_struct('event_type', event_type, 'event_ts', event_ts))", - baseColumn=['event_type', 'event_ts']) - ) + testDataSpec = ( + dg.DataGenerator(spark, 
name="device_data_set", rows=1000000, + partitions=8, + randomSeedMethod='hash_fieldname') + .withIdOutput() + # we'll use hash of the base field to generate the ids to + # avoid a simple incrementing sequence + .withColumn("internal_device_id", LongType(), minValue=0x1000000000000, + uniqueValues=device_population, omit=True, baseColumnType="hash") + + # note for format strings, we must use "%lx" not "%x" as the + # underlying value is a long + .withColumn("device_id", StringType(), format="0x%013x", + baseColumn="internal_device_id") + + # the device / user attributes will be the same for the same device id + # so lets use the internal device id as the base column for these attribute + .withColumn("country", StringType(), values=country_codes, + weights=country_weights, + baseColumn="internal_device_id") + + .withColumn("manufacturer", StringType(), values=manufacturers, + baseColumn="internal_device_id", omit=True) + .withColumn("line", StringType(), values=lines, baseColumn="manufacturer", + baseColumnType="hash", omit=True) + .withColumn("manufacturer_info", "string", + expr="to_json(named_struct('line', line, 'manufacturer', manufacturer))", + baseColumn=['manufacturer', 'line']) + + + .withColumn("model_ser", IntegerType(), minValue=1, maxValue=11, + baseColumn="device_id", + baseColumnType="hash", omit=True) + + .withColumn("event_type", StringType(), + values=["activation", "deactivation", "plan change", + "telecoms activity", "internet activity", "device error"], + random=True, omit=True) + .withColumn("event_ts", "timestamp", + begin="2020-01-01 01:00:00", end="2020-12-31 23:59:00", + interval="1 minute", random=True, omit=True) + + .withColumn("event_info", "string", + expr="to_json(named_struct('event_type', event_type, 'event_ts', event_ts))", + baseColumn=['event_type', 'event_ts']) + ) dfTestData = testDataSpec.build() diff --git a/docs/source/multi_table_data.rst b/docs/source/multi_table_data.rst index 5eab313d..d5aa5ec2 100644 --- 
a/docs/source/multi_table_data.rst +++ b/docs/source/multi_table_data.rst @@ -1,7 +1,7 @@ .. Test Data Generator documentation master file, created by - sphinx-quickstart on Sun Jun 21 10:54:30 2020. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. +sphinx-quickstart on Sun Jun 21 10:54:30 2020. +You can adapt this file completely to your liking, but it should at least +contain the root `toctree` directive. Generating and Using Data with Multiple Tables ============================================== @@ -73,7 +73,9 @@ Here we use a simple sequence for our plan ids. import dbldatagen as dg import pyspark.sql.functions as F - spark.catalog.clearCache() # clear cache so that if we run multiple times to check performance, we're not relying on cache + # clear cache so that if we run multiple times to check performance, + # we're not relying on cache + spark.catalog.clearCache() UNIQUE_PLANS = 20 PLAN_MIN_VALUE = 100 @@ -87,36 +89,35 @@ Here we use a simple sequence for our plan ids. 
spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", 20000) - plan_dataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested) - .withColumn("plan_id","int", minValue=PLAN_MIN_VALUE, uniqueValues=UNIQUE_PLANS) - # use plan_id as root value - .withColumn("plan_name", prefix="plan", baseColumn="plan_id") - - # note default step is 1 so you must specify a step for small number ranges, - .withColumn("cost_per_mb", "decimal(5,3)", minValue=0.005, maxValue=0.050, - step=0.005, random=True) - .withColumn("cost_per_message", "decimal(5,3)", minValue=0.001, maxValue=0.02, - step=0.001, random=True) - .withColumn("cost_per_minute", "decimal(5,3)", minValue=0.001, maxValue=0.01, - step=0.001, random=True) - - # we're modelling long distance and international prices simplistically - - # each is a multiplier thats applied to base rate - .withColumn("ld_multiplier", "decimal(5,3)", minValue=1.5, maxValue=3, step=0.05, - random=True, distribution="normal", omit=True) - .withColumn("ld_cost_per_minute", "decimal(5,3)", - expr="cost_per_minute * ld_multiplier", - baseColumns=['cost_per_minute', 'ld_multiplier']) - .withColumn("intl_multiplier", "decimal(5,3)", minValue=2, maxValue=4, step=0.05, - random=True, distribution="normal", omit=True) - .withColumn("intl_cost_per_minute", "decimal(5,3)", - expr="cost_per_minute * intl_multiplier", - baseColumns=['cost_per_minute', 'intl_multiplier']) + plan_dataspec = ( + dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested) + .withColumn("plan_id","int", minValue=PLAN_MIN_VALUE, uniqueValues=UNIQUE_PLANS) + # use plan_id as root value + .withColumn("plan_name", prefix="plan", baseColumn="plan_id") + + # note default step is 1 so you must specify a step for small number ranges, + .withColumn("cost_per_mb", "decimal(5,3)", minValue=0.005, maxValue=0.050, + step=0.005, random=True) + .withColumn("cost_per_message", "decimal(5,3)", minValue=0.001, maxValue=0.02, + step=0.001, random=True) + 
.withColumn("cost_per_minute", "decimal(5,3)", minValue=0.001, maxValue=0.01, + step=0.001, random=True) + + # we're modelling long distance and international prices simplistically - + # each is a multiplier thats applied to base rate + .withColumn("ld_multiplier", "decimal(5,3)", minValue=1.5, maxValue=3, step=0.05, + random=True, distribution="normal", omit=True) + .withColumn("ld_cost_per_minute", "decimal(5,3)", + expr="cost_per_minute * ld_multiplier", + baseColumns=['cost_per_minute', 'ld_multiplier']) + .withColumn("intl_multiplier", "decimal(5,3)", minValue=2, maxValue=4, step=0.05, + random=True, distribution="normal", omit=True) + .withColumn("intl_cost_per_minute", "decimal(5,3)", + expr="cost_per_minute * intl_multiplier", + baseColumns=['cost_per_minute', 'intl_multiplier']) ) - df_plans = (plan_dataspec.build() - .cache() - ) + df_plans = plan_dataspec.build().cache() display(df_plans) @@ -195,10 +196,11 @@ when using hashed values, the range of the hashes produced can be large. 
effective_customers = df_customers.count() - print(stripMargin(f"""revised customers : {df_customers.count()}, - | unique customers: {df_customers.select(F.countDistinct('customer_id')).take(1)[0][0]}, - | unique device ids: {df_customers.select(F.countDistinct('device_id')).take(1)[0][0]}, - | unique phone numbers: {df_customers.select(F.countDistinct('phone_number')).take(1)[0][0]}""") + print(stripMargin( + f"""revised customers : {df_customers.count()}, + | unique customers: {df_customers.select(F.countDistinct('customer_id')).take(1)[0][0]}, + | unique device ids: {df_customers.select(F.countDistinct('device_id')).take(1)[0][0]}, + | unique phone numbers: {df_customers.select(F.countDistinct('phone_number')).take(1)[0][0]}""") ) display(df_customers) @@ -247,7 +249,8 @@ A simple approach is simply to multiply the # use random seed method of 'hash_fieldname' for better spread - default in later builds events_dataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested, randomSeed=42, randomSeedMethod="hash_fieldname") - # use same logic as per customers dataset to ensure matching keys - but make them random + # use same logic as per customers dataset to ensure matching keys + # but make them random .withColumn("device_id_base","decimal(10)", minValue=CUSTOMER_MIN_VALUE, uniqueValues=UNIQUE_CUSTOMERS, random=True, omit=True) @@ -260,12 +263,14 @@ A simple approach is simply to multiply the weights=[50, 50, 20, 10, 5 ], random=True) # use Gamma distribution for skew towards short calls - .withColumn("base_minutes","decimal(7,2)", minValue=1.0, maxValue=100.0, step=0.1, + .withColumn("base_minutes","decimal(7,2)", + minValue=1.0, maxValue=100.0, step=0.1, distribution=dg.distributions.Gamma(shape=1.5, scale=2.0), random=True, omit=True) # use Gamma distribution for skew towards short transfers - .withColumn("base_bytes_transferred","decimal(12)", minValue=K_1, maxValue=MB_100, + .withColumn("base_bytes_transferred","decimal(12)", + 
minValue=K_1, maxValue=MB_100, distribution=dg.distributions.Gamma(shape=0.75, scale=2.0), random=True, omit=True) @@ -308,8 +313,7 @@ Let's compute the customers and associated plans import pyspark.sql.functions as F import pyspark.sql.types as T - df_customer_pricing = df_customers.join(df_plans, - df_plans.plan_id == df_customers.plan) + df_customer_pricing = df_customers.join(df_plans, df_plans.plan_id == df_customers.plan) display(df_customer_pricing) @@ -365,8 +369,9 @@ now let's compute the invoices .. code-block:: python - df_customer_summary = (df_customer_pricing.join(df_summary, - df_customer_pricing.device_id == df_summary.device_id ) + df_customer_summary = ( + df_customer_pricing.join(df_summary, + df_customer_pricing.device_id == df_summary.device_id ) .createOrReplaceTempView("customer_summary")) df_invoices = spark.sql(""" diff --git a/docs/source/troubleshooting.rst b/docs/source/troubleshooting.rst index 386d35ab..83d721b3 100644 --- a/docs/source/troubleshooting.rst +++ b/docs/source/troubleshooting.rst @@ -165,50 +165,55 @@ In these cases, we use the `baseColumn` attribute to ensure the correct column b lines = ['delta', 'xyzzy', 'lakehouse', 'gadget', 'droid'] - testDataSpec = (dg.DataGenerator(spark, name="device_data_set", rows=1000000, - partitions=8, - randomSeedMethod='hash_fieldname') - # we'll use hash of the base field to generate the ids to - # avoid a simple incrementing sequence - .withColumn("internal_device_id", "long", minValue=0x1000000000000, - uniqueValues=device_population, omit=True, baseColumnType="hash") - - # note for format strings, we must use "%lx" not "%x" as the - # underlying value is a long - .withColumn("device_id", "string", format="0x%013x", - baseColumn="internal_device_id") - - # the device / user attributes will be the same for the same device id - # so lets use the internal device id as the base column for these attribute - .withColumn("country", "string", values=country_codes, - weights=country_weights, - 
baseColumn="internal_device_id") - - .withColumn("manufacturer", "string", values=manufacturers, - baseColumn="internal_device_id", omit=True) - - .withColumn("line", StringType(), values=lines, baseColumn="manufacturer", - baseColumnType="hash", omit=True) - - # note use of baseColumn to control column build ordering - .withColumn("manufacturer_info", "string", - expr="to_json(named_struct('line', line, 'manufacturer', manufacturer))", - baseColumn=["line", "manufacturer"] - ) - - .withColumn("event_type", "string", - values=["activation", "deactivation", "plan change", - "telecoms activity", "internet activity", "device error"], - random=True, omit=True) - - .withColumn("event_ts", "timestamp", begin="2020-01-01 01:00:00", end="2020-12-31 23:59:00", - interval="1 minute", random=True, omit=True) - - # note use of baseColumn to control column build ordering - .withColumn("event_info", "string", - expr="to_json(named_struct('event_type', event_type, 'event_ts', event_ts))", - baseColumn=["event_type", "event_ts"]) - ) + testDataSpec = ( + dg.DataGenerator(spark, name="device_data_set", rows=1000000, + partitions=8, + randomSeedMethod='hash_fieldname') + # we'll use hash of the base field to generate the ids to + # avoid a simple incrementing sequence + .withColumn("internal_device_id", "long", minValue=0x1000000000000, + uniqueValues=device_population, omit=True, baseColumnType="hash") + + # note for format strings, we must use "%lx" not "%x" as the + # underlying value is a long + .withColumn("device_id", "string", format="0x%013x", + baseColumn="internal_device_id") + + # the device / user attributes will be the same for the same device id + # so lets use the internal device id as the base column for these attribute + .withColumn("country", "string", values=country_codes, + weights=country_weights, + baseColumn="internal_device_id") + + .withColumn("manufacturer", "string", values=manufacturers, + baseColumn="internal_device_id", omit=True) + + 
.withColumn("line", StringType(), values=lines, baseColumn="manufacturer", + baseColumnType="hash", omit=True) + + # note use of baseColumn to control column build ordering + .withColumn("manufacturer_info", "string", + expr="to_json(named_struct('line', line, 'manufacturer', manufacturer))", + baseColumn=["line", "manufacturer"] + ) + + .withColumn("event_type", "string", + values=["activation", "deactivation", "plan change", + "telecoms activity", "internet activity", "device error"], + random=True, omit=True) + + .withColumn("event_ts", "timestamp", + begin="2020-01-01 01:00:00", + end="2020-12-31 23:59:00", + interval="1 minute", + random=True, + omit=True) + + # note use of baseColumn to control column build ordering + .withColumn("event_info", "string", + expr="to_json(named_struct('event_type', event_type, 'event_ts', event_ts))", + baseColumn=["event_type", "event_ts"]) + ) dfTestData = testDataSpec.build() From c4fdc3bc8e77a7eeaaa2ff03797cf160675cb79b Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Tue, 9 May 2023 11:59:45 -0700 Subject: [PATCH 02/20] wip --- python/dev_require.txt | 1 - python/require.txt | 1 - 2 files changed, 2 deletions(-) diff --git a/python/dev_require.txt b/python/dev_require.txt index 8d53d810..a34ed3b2 100644 --- a/python/dev_require.txt +++ b/python/dev_require.txt @@ -31,7 +31,6 @@ pypandoc ipython==7.22.0 recommonmark sphinx-markdown-builder -rst2pdf==0.98 Jinja2 < 3.1 sphinx-copybutton diff --git a/python/require.txt b/python/require.txt index 53f80fde..5f0e30a4 100644 --- a/python/require.txt +++ b/python/require.txt @@ -30,7 +30,6 @@ pypandoc ipython==7.22.0 recommonmark sphinx-markdown-builder -rst2pdf==0.98 Jinja2 < 3.1 sphinx-copybutton From 46969e7647be3fa2641ab69bf2dcea21727f6920 Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Tue, 9 May 2023 12:07:07 -0700 Subject: [PATCH 03/20] wip --- .github/workflows/push.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/push.yml 
b/.github/workflows/push.yml index 7bc7bc88..fbdfa548 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -31,7 +31,7 @@ jobs: - name: Set up Python 3.8 uses: actions/setup-python@v3 with: - python-version: '3.8' + python-version: '3.8.10' cache: 'pipenv' - name: Install From cf9e909e555eb218b772c0a219748595f4b16c3e Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Tue, 9 May 2023 12:10:53 -0700 Subject: [PATCH 04/20] wip --- .github/workflows/push.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index fbdfa548..8192e3a9 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -29,7 +29,7 @@ jobs: # ${{ runner.os }}-go- - name: Set up Python 3.8 - uses: actions/setup-python@v3 + uses: actions/setup-python@v4 with: python-version: '3.8.10' cache: 'pipenv' From ba929ac892c79b8fe1f5469d05ec2f2bd1f38578 Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Tue, 9 May 2023 12:13:06 -0700 Subject: [PATCH 05/20] wip --- .github/workflows/push.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 8192e3a9..5e9a736b 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -31,7 +31,7 @@ jobs: - name: Set up Python 3.8 uses: actions/setup-python@v4 with: - python-version: '3.8.10' + python-version: '3.8.12' cache: 'pipenv' - name: Install From 03245c392b71eb42b538e6489d97e99d68b8ac29 Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Tue, 9 May 2023 12:15:05 -0700 Subject: [PATCH 06/20] wip --- .github/workflows/push.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 5e9a736b..1a41f6b2 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -34,6 +34,9 @@ jobs: python-version: '3.8.12' cache: 'pipenv' + - name: Install pip + run: python -m pip install --upgrade pip + - name: 
Install run: pip install pipenv From 4b94465526facd93cd93933e7ae1ebedd51ff879 Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Tue, 9 May 2023 12:17:24 -0700 Subject: [PATCH 07/20] wip --- .github/workflows/push.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 1a41f6b2..de6893f4 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -28,15 +28,15 @@ jobs: # restore-keys: | # ${{ runner.os }}-go- - - name: Set up Python 3.8 + - name: Set up Python 3.9 uses: actions/setup-python@v4 with: - python-version: '3.8.12' + python-version: '3.9' cache: 'pipenv' - name: Install pip run: python -m pip install --upgrade pip - + - name: Install run: pip install pipenv From ff1574c359908d67a2e7a2b196ce1db1b2d40225 Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Tue, 9 May 2023 12:20:11 -0700 Subject: [PATCH 08/20] wip --- .github/workflows/push.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index de6893f4..699bb31e 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -8,11 +8,11 @@ on: jobs: tests: - # Ubuntu latest no longer installs Python 3.8 by default so install it + # Ubuntu latest no longer installs Python 3.9 by default so install it runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Unshallow run: git fetch --prune --unshallow From 97616edd58265b96b996d5c258c2776434472cf7 Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Tue, 9 May 2023 12:25:19 -0700 Subject: [PATCH 09/20] wip --- .github/workflows/push.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 699bb31e..3d84fe78 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -13,6 +13,8 @@ jobs: steps: - name: Checkout uses: actions/checkout@v3 + 
with: + fetch-depth: 0 - name: Unshallow run: git fetch --prune --unshallow @@ -34,6 +36,9 @@ jobs: python-version: '3.9' cache: 'pipenv' + - name: Check Python version + run: python --version + - name: Install pip run: python -m pip install --upgrade pip From 064ab0ae28488f152e059c3ac2a1ec55ea79b85d Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Tue, 9 May 2023 12:26:36 -0700 Subject: [PATCH 10/20] wip --- .github/workflows/push.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 3d84fe78..69a2802a 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -16,9 +16,6 @@ jobs: with: fetch-depth: 0 - - name: Unshallow - run: git fetch --prune --unshallow - # - name: Cache packages # uses: actions/cache@v2 # with: From 613bf9c2155cbb6299c60b7df0ea14912a222038 Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Tue, 9 May 2023 12:31:35 -0700 Subject: [PATCH 11/20] wip --- .github/workflows/push.yml | 4 ++-- Pipfile | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 69a2802a..5e9768c4 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -27,10 +27,10 @@ jobs: # restore-keys: | # ${{ runner.os }}-go- - - name: Set up Python 3.9 + - name: Set up Python 3.8 uses: actions/setup-python@v4 with: - python-version: '3.9' + python-version: '3.8.12' cache: 'pipenv' - name: Check Python version diff --git a/Pipfile b/Pipfile index e6c00c15..0ac73774 100644 --- a/Pipfile +++ b/Pipfile @@ -28,4 +28,4 @@ pyparsing = "==2.4.7" jmespath = "==0.10.0" [requires] -python_version = ">=3.8.10" +python_version = "==3.8.12" From 1b90eb367e3dccb347f8440afde59a13dc50485e Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Tue, 9 May 2023 12:34:21 -0700 Subject: [PATCH 12/20] wip --- Pipfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Pipfile b/Pipfile index 0ac73774..ee4d7609 100644 
--- a/Pipfile +++ b/Pipfile @@ -28,4 +28,4 @@ pyparsing = "==2.4.7" jmespath = "==0.10.0" [requires] -python_version = "==3.8.12" +python_version = "3.8.12" From 65f4d54e35ac88bce293fd1aeada5b400fae1b4c Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Tue, 9 May 2023 12:53:49 -0700 Subject: [PATCH 13/20] updated build version --- .github/workflows/onrelease.yml | 13 ++++++++----- CONTRIBUTING.md | 6 +++--- README.md | 2 +- dbldatagen/_version.py | 2 +- docs/source/conf.py | 2 +- python/.bumpversion.cfg | 2 +- setup.py | 2 +- 7 files changed, 16 insertions(+), 13 deletions(-) diff --git a/.github/workflows/onrelease.yml b/.github/workflows/onrelease.yml index 479a0cbf..3827a3d6 100644 --- a/.github/workflows/onrelease.yml +++ b/.github/workflows/onrelease.yml @@ -18,15 +18,18 @@ jobs: - name: Checkout uses: actions/checkout@v2 - - name: Unshallow - run: git fetch --prune --unshallow - - name: Set up Python 3.8 - uses: actions/setup-python@v3 + uses: actions/setup-python@v4 with: - python-version: '3.8' + python-version: '3.8.12' cache: 'pipenv' + - name: Check Python version + run: python --version + + - name: Install pip + run: python -m pip install --upgrade pip + - name: Install run: pip install pipenv diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2d20cbd6..9402e2d4 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -19,10 +19,10 @@ Dependent packages are not installed automatically by the `dbldatagen` package. ## Python compatibility -The code has been tested with Python 3.8.10 and later. +The code has been tested with Python 3.8.12 and later. -Older releases were tested with Python 3.7.5 but as of this release, it requires the Databricks runtime 9.1 LTS or later -which relies on Python 3.8.10 +Older releases were tested with Python 3.7.5 but as of this release, it requires the Databricks +runtime 9.1 LTS or later. 
## Checking your code for common issues diff --git a/README.md b/README.md index c15236a1..95df90ee 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,7 @@ details of use and many examples. Release notes and details of the latest changes for this specific release can be found in the GitHub repository -[here](https://github.com/databrickslabs/dbldatagen/blob/release/v0.3.4post2/CHANGELOG.md) +[here](https://github.com/databrickslabs/dbldatagen/blob/release/v0.3.4post3/CHANGELOG.md) # Installation diff --git a/dbldatagen/_version.py b/dbldatagen/_version.py index d432695b..3c65fd7a 100644 --- a/dbldatagen/_version.py +++ b/dbldatagen/_version.py @@ -34,7 +34,7 @@ def get_version(version): return version_info -__version__ = "0.3.4post2" # DO NOT EDIT THIS DIRECTLY! It is managed by bumpversion +__version__ = "0.3.4post3" # DO NOT EDIT THIS DIRECTLY! It is managed by bumpversion __version_info__ = get_version(__version__) diff --git a/docs/source/conf.py b/docs/source/conf.py index 4ae0871e..80d91087 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -28,7 +28,7 @@ author = 'Databricks Inc' # The full version, including alpha/beta/rc tags -release = "0.3.4post2" # DO NOT EDIT THIS DIRECTLY! It is managed by bumpversion +release = "0.3.4post3" # DO NOT EDIT THIS DIRECTLY! 
It is managed by bumpversion # -- General configuration --------------------------------------------------- diff --git a/python/.bumpversion.cfg b/python/.bumpversion.cfg index 5c7c1e86..365a83e3 100644 --- a/python/.bumpversion.cfg +++ b/python/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.3.4post2 +current_version = 0.3.4post3 commit = False tag = False parse = (?P\d+)\.(?P\d+)\.(?P\d+){0,1}(?P\D*)(?P\d*) diff --git a/setup.py b/setup.py index 34375fb3..91f3fb1c 100644 --- a/setup.py +++ b/setup.py @@ -31,7 +31,7 @@ setuptools.setup( name="dbldatagen", - version="0.3.4post2", + version="0.3.4post3", author="Ronan Stokes, Databricks", description="Databricks Labs - PySpark Synthetic Data Generator", long_description=long_description, From c81cc669a2091a09a71e7603573ed451303c6436 Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Tue, 9 May 2023 12:56:05 -0700 Subject: [PATCH 14/20] updated build version --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index bc48ca45..afb34eee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,10 @@ All notable changes to the Databricks Labs Data Generator will be documented in #### Changed * Added formatting of generated code as Html for script methods +### Version 0.3.4 Post 3 + +### Changed +* Build now uses Python 3.8.12. Updated build process to reflect that. 
# spacing symbols - note that order is important
+ """ - def __init__(self, paragraphs=None, sentences=None, words=None, extendedWordList=None): + def __init__(self, paragraphs=None, sentences=None, words=None, extendedWordList=None, punctuation=None): """ Initialize the ILText with text generation parameters """ @@ -685,6 +697,7 @@ def __init__(self, paragraphs=None, sentences=None, words=None, extendedWordList self.sentences = self.getAsTupleOrElse(sentences, (1, 1), "sentences") self.wordList = extendedWordList if extendedWordList is not None else _WORDS_LOWER self.shape = [self.paragraphs[1], self.sentences[1], self.words[1]] + self.punctuation = np.array(punctuation) if punctuation is not None else np.array(DEFAULT_PUNCTUATION) # values needed for the text generation # numpy uses fixed sizes for strings , so compute whats needed @@ -717,11 +730,15 @@ def _processWordList(self): self._wordOffsetSize = all_words.size self._sentenceEndOffset = all_words.size - self._paragraphEnd = self._sentenceEndOffset + 1 + + punctuation = [] + punctuation.extend(self.punctuation) + punctuation.extend(SPACE_SYMBOLS) + + self._paragraphEnd = self._sentenceEndOffset + len(self.punctuation) self._wordSpaceOffset = self._paragraphEnd + 1 self._emptyStringOffset = self._wordSpaceOffset + 1 - punctuation = [". 
", "\n\n", " ", ""] all_words = np.concatenate((all_words, punctuation)) self._startOfCapitalsOffset = all_words.size @@ -820,6 +837,8 @@ def generateText(self, baseValues, rowCount=1): new_col = new_word_offsets[:, :, :, np.newaxis] terminated_word_offsets = np.ma.concatenate((masked_offsets, new_col), axis=3) new_column = terminated_word_offsets[:, :, :, -1] + + # TODO: modify to add punctuation to end of sentences new_column[~new_column.mask] = self._sentenceEndOffset # reshape to paragraphs diff --git a/tests/test_text_generation.py b/tests/test_text_generation.py index fb23d9d3..374449cc 100644 --- a/tests/test_text_generation.py +++ b/tests/test_text_generation.py @@ -272,6 +272,8 @@ def test_small_ILText_driven_data_generation(self): df_iltext_data = testDataSpec.build() + df_iltext_data.show() + counts = df_iltext_data.agg( F.countDistinct("paras").alias("paragraphs_count") ).collect()[0] From 9ff9cfef510d366bb75cb0be230844cfad601629 Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Tue, 9 May 2023 14:27:18 -0700 Subject: [PATCH 16/20] wip --- dbldatagen/text_generatestring.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 dbldatagen/text_generatestring.py diff --git a/dbldatagen/text_generatestring.py b/dbldatagen/text_generatestring.py new file mode 100644 index 00000000..e69de29b From 61115601f0b3fb0915812f5d640fb1e9948d11ee Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Tue, 9 May 2023 14:32:15 -0700 Subject: [PATCH 17/20] wip --- dbldatagen/__init__.py | 3 ++- dbldatagen/column_generation_spec.py | 2 +- dbldatagen/text_generators.py | 8 ++++++++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/dbldatagen/__init__.py b/dbldatagen/__init__.py index 49eea723..69956112 100644 --- a/dbldatagen/__init__.py +++ b/dbldatagen/__init__.py @@ -41,12 +41,13 @@ from .spark_singleton import SparkSingleton from .text_generators import TemplateGenerator, ILText, TextGenerator from .text_generator_plugins import PyfuncText, 
PyfuncTextFactory, FakerTextFactory, fakerText +from .text_generatestring import GenerateString from .html_utils import HtmlUtils __all__ = ["data_generator", "data_analyzer", "schema_parser", "daterange", "nrange", "column_generation_spec", "utils", "function_builder", "spark_singleton", "text_generators", "datarange", "datagen_constants", - "text_generator_plugins", "html_utils" + "text_generator_plugins", "html_utils", "text_generatestring" ] diff --git a/dbldatagen/column_generation_spec.py b/dbldatagen/column_generation_spec.py index 6456a389..e77ca977 100644 --- a/dbldatagen/column_generation_spec.py +++ b/dbldatagen/column_generation_spec.py @@ -1107,7 +1107,7 @@ def _applyPrefixSuffixExpressions(self, cprefix, csuffix, new_def): new_def = concat(new_def.astype(IntegerType()), lit(text_separator), lit(csuffix)) return new_def - def _applyTextGenerationExpression(self, new_def, use_pandas_optimizations): + def _applyTextGenerationExpression(self, new_def, use_pandas_optimizations=True): """Apply text generation expression to column expression :param new_def : column definition being created diff --git a/dbldatagen/text_generators.py b/dbldatagen/text_generators.py index 9c3c6b0e..b76a36fd 100644 --- a/dbldatagen/text_generators.py +++ b/dbldatagen/text_generators.py @@ -167,6 +167,14 @@ def getAsTupleOrElse(v, defaultValue, valueName): return defaultValue + def prepareBaseValue(self, baseDef): + """ Prepare the base value for processing + :param baseDef: base value expression + :return: base value expression unchanged + Derived classes are expected to override this if needed + """ + return baseDef + class TemplateGenerator(TextGenerator): # lgtm [py/missing-equals] """This class handles the generation of text from templates From 16284e0cb3da189418963b3155cb10f841d34094 Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Thu, 1 Jun 2023 01:37:22 -0700 Subject: [PATCH 18/20] added GenerateStr and html output to data analyzer --- CHANGELOG.md | 1 + 
dbldatagen/__init__.py | 2 +- dbldatagen/column_generation_spec.py | 3 + dbldatagen/data_analyzer.py | 45 ++++--- dbldatagen/text_generatestring.py | 181 +++++++++++++++++++++++++++ dbldatagen/text_generators.py | 27 +--- docs/utils/mk_quick_index.py | 2 + tests/test_text_generatestring.py | 99 +++++++++++++++ 8 files changed, 323 insertions(+), 37 deletions(-) create mode 100644 tests/test_text_generatestring.py diff --git a/CHANGELOG.md b/CHANGELOG.md index afb34eee..d55f6401 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ All notable changes to the Databricks Labs Data Generator will be documented in #### Changed * Added formatting of generated code as Html for script methods +* Added text generator `GenerateString` ### Version 0.3.4 Post 3 diff --git a/dbldatagen/__init__.py b/dbldatagen/__init__.py index 69956112..101b3f84 100644 --- a/dbldatagen/__init__.py +++ b/dbldatagen/__init__.py @@ -47,7 +47,7 @@ __all__ = ["data_generator", "data_analyzer", "schema_parser", "daterange", "nrange", "column_generation_spec", "utils", "function_builder", "spark_singleton", "text_generators", "datarange", "datagen_constants", - "text_generator_plugins", "html_utils", "text_generatestring" + "text_generator_plugins", "html_utils", "text_generatestring", "value_based_prng" ] diff --git a/dbldatagen/column_generation_spec.py b/dbldatagen/column_generation_spec.py index e77ca977..da9c99f1 100644 --- a/dbldatagen/column_generation_spec.py +++ b/dbldatagen/column_generation_spec.py @@ -1118,6 +1118,9 @@ def _applyTextGenerationExpression(self, new_def, use_pandas_optimizations=True) # while it seems like this could use a shared instance, this does not work if initialized # in a class method tg = self.textGenerator + + new_def = tg.prepareBaseValue(new_def) + if use_pandas_optimizations: self.executionHistory.append(f".. 
text generation via pandas scalar udf `{tg}`") u_value_from_generator = pandas_udf(tg.pandasGenerateText, diff --git a/dbldatagen/data_analyzer.py b/dbldatagen/data_analyzer.py index 5aec5245..ecacd95c 100644 --- a/dbldatagen/data_analyzer.py +++ b/dbldatagen/data_analyzer.py @@ -14,6 +14,7 @@ import pyspark.sql.functions as F from .utils import strip_margins +from .html_utils import HtmlUtils from .spark_singleton import SparkSingleton @@ -359,7 +360,7 @@ def _scriptDataGeneratorCode(cls, schema, dataSummary=None, sourceDf=None, suppr return "\n".join(stmts) @classmethod - def scriptDataGeneratorFromSchema(cls, schema, suppressOutput=False, name=None): + def scriptDataGeneratorFromSchema(cls, schema, suppressOutput=False, name=None, asHtml=False): """ Generate outline data generator code from an existing dataframe @@ -373,16 +374,24 @@ def scriptDataGeneratorFromSchema(cls, schema, suppressOutput=False, name=None): The dataframe to be analyzed is the dataframe passed to the constructor of the DataAnalyzer object. :param schema: Pyspark schema - i.e manually constructed StructType or return value from `dataframe.schema` - :param suppressOutput: Suppress printing of generated code if True + :param suppressOutput: Suppress printing of generated code if True. If asHtml is True, output is suppressed :param name: Optional name for data generator - :return: String containing skeleton code + :param asHtml: If True, will generate Html suitable for notebook ``displayHtml``. 
+ :return: String containing skeleton code (in Html form if `asHtml` is True) """ - return cls._scriptDataGeneratorCode(schema, - suppressOutput=suppressOutput, - name=name) + omit_output_printing = suppressOutput or asHtml + + generated_code = cls._scriptDataGeneratorCode(schema, + suppressOutput=omit_output_printing, + name=name) + + if asHtml: + generated_code = HtmlUtils.formatCodeAsHtml(generated_code) + + return generated_code - def scriptDataGeneratorFromData(self, suppressOutput=False, name=None): + def scriptDataGeneratorFromData(self, suppressOutput=False, name=None, asHtml=False): """ Generate outline data generator code from an existing dataframe @@ -395,14 +404,17 @@ def scriptDataGeneratorFromData(self, suppressOutput=False, name=None): The dataframe to be analyzed is the Spark dataframe passed to the constructor of the DataAnalyzer object - :param suppressOutput: Suppress printing of generated code if True + :param suppressOutput: Suppress printing of generated code if True. If asHtml is True, output is suppressed :param name: Optional name for data generator - :return: String containing skeleton code + :param asHtml: If True, will generate Html suitable for notebook ``displayHtml``. 
+ :return: String containing skeleton code (in Html form if `asHtml` is True) """ assert self._df is not None assert type(self._df) is ssql.DataFrame, "sourceDf must be a valid Pyspark dataframe" + omit_output_printing = suppressOutput or asHtml + if self._dataSummary is None: df_summary = self.summarizeToDF() @@ -411,8 +423,13 @@ def scriptDataGeneratorFromData(self, suppressOutput=False, name=None): row_key_pairs = row.asDict() self._dataSummary[row['measure_']] = row_key_pairs - return self._scriptDataGeneratorCode(self._df.schema, - suppressOutput=suppressOutput, - name=name, - dataSummary=self._dataSummary, - sourceDf=self._df) + generated_code = self._scriptDataGeneratorCode(self._df.schema, + suppressOutput=omit_output_printing, + name=name, + dataSummary=self._dataSummary, + sourceDf=self._df) + + if asHtml: + generated_code = HtmlUtils.formatCodeAsHtml(generated_code) + + return generated_code diff --git a/dbldatagen/text_generatestring.py b/dbldatagen/text_generatestring.py index e69de29b..fdb561bd 100644 --- a/dbldatagen/text_generatestring.py +++ b/dbldatagen/text_generatestring.py @@ -0,0 +1,181 @@ +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +This file defines the RandomStr text generator +""" + +import math +import random + +import numpy as np +import pandas as pd + +import pyspark.sql.functions as F + +from .text_generators import TextGenerator +from .text_generators import _DIGITS_ZERO, _LETTERS_UPPER, _LETTERS_LOWER, _LETTERS_ALL + + +class GenerateString(TextGenerator): # lgtm [py/missing-equals] + """This class handles the generation of string text of specified length drawn from alphanumeric characters. + + The set of chars to be used can be modified based on the parameters + + This will generate deterministic strings chosen from the pool of characters `0-9`, `a-z`, `A-Z`, or from a + custom character range if specified. + + :param length: length of string. 
+    The modifiers can be combined - for example GenerateString((1, 5), leadingAlpha=True, allUpper=True)
+
+    When the length is specified to be a tuple, it will generate variable length strings of lengths from the lower bound
+    to the upper bound inclusive.
+ + """ + + def __init__(self, length, leadingAlpha=True, allUpper=False, allLower=False, allAlpha=False, customChars=None): + super().__init__() + + assert not customChars or isinstance(customChars, (list, str)), \ + "`customChars` should be list of characters or string containing custom chars" + + assert not allUpper or not allLower, "allUpper and allLower cannot both be True" + + if isinstance(customChars, str): + assert len(customChars) > 0, "string of customChars must be non-empty" + elif isinstance(customChars, list): + assert all(isinstance(c, str) for c in customChars) + assert len(customChars) > 0, "list of customChars must be non-empty" + + self.leadingAlpha = leadingAlpha + self.allUpper = allUpper + self.allLower = allLower + self.allAlpha = allAlpha + + # determine base alphabet + if isinstance(customChars, list): + charAlphabet = set("".join(customChars)) + elif isinstance(customChars, str): + charAlphabet = set(customChars) + else: + charAlphabet = set(_LETTERS_ALL).union(set(_DIGITS_ZERO)) + + if allLower: + charAlphabet = charAlphabet.difference(set(_LETTERS_UPPER)) + elif allUpper: + charAlphabet = charAlphabet.difference(set(_LETTERS_LOWER)) + + if allAlpha: + charAlphabet = charAlphabet.difference(set(_DIGITS_ZERO)) + + self._charAlphabet = np.array(list(charAlphabet)) + + if leadingAlpha: + self._firstCharAlphabet = np.array(list(charAlphabet.difference(set(_DIGITS_ZERO)))) + else: + self._firstCharAlphabet = self._charAlphabet + + # compute string lengths + if isinstance(length, int): + self._minLength = length + self._maxLength = length + elif isinstance(length, tuple): + assert len(length) == 2, "only 2 elements can be specified if length is a tuple" + assert all(isinstance(el, int) for el in length) + self._minLength, self._maxLength = length + else: + raise ValueError("`length` must be an integer or a tuple of two integers") + + # compute bounds for generated strings + bounds = [len(self._firstCharAlphabet)] + for ix in range(1, 
+        For generate string processing, we'll use the SQL function abs(hash(baseDef))
mask=placeholder_mask) + + masked_placeholders[~placeholder_mask] = self._charAlphabet[rnds2[~placeholder_mask]] + + output = pd.Series(list(placeholders)) + + # join strings in placeholders + results = output.apply(lambda placeholder_items: "".join([str(elem) for elem in placeholder_items])) + + return results diff --git a/dbldatagen/text_generators.py b/dbldatagen/text_generators.py index b76a36fd..403e06d8 100644 --- a/dbldatagen/text_generators.py +++ b/dbldatagen/text_generators.py @@ -60,12 +60,6 @@ 'PROIDENT', 'SUNT', 'IN', 'CULPA', 'QUI', 'OFFICIA', 'DESERUNT', 'MOLLIT', 'ANIM', 'ID', 'EST', 'LABORUM'] -# spacing smbols - note that order is important -SPACE_SYMBOLS = ["\n\n", " ", ""] - -# default punctuation symbols -DEFAULT_PUNCTUATION = [". "] - class TextGenerator(object): """ Base class for text generation classes @@ -169,8 +163,10 @@ def getAsTupleOrElse(v, defaultValue, valueName): def prepareBaseValue(self, baseDef): """ Prepare the base value for processing + :param baseDef: base value expression :return: base value expression unchanged + Derived classes are expected to override this if needed """ return baseDef @@ -682,16 +678,10 @@ class ILText(TextGenerator): # lgtm [py/missing-equals] :param paragraphs: Number of paragraphs to generate. If tuple will generate random number in range :param sentences: Number of sentences to generate. If tuple will generate random number in tuple range :param words: Number of words per sentence to generate. If tuple, will generate random number in tuple range - :param punctuation: List of strings or single string of punctuation to use - - If `punctuation` is not specified, the following punctuation will be used: [". "] - - Note that punctuation will only be used at end of each logical sentence, not at end of each word. 
- """ - def __init__(self, paragraphs=None, sentences=None, words=None, extendedWordList=None, punctuation=None): + def __init__(self, paragraphs=None, sentences=None, words=None, extendedWordList=None): """ Initialize the ILText with text generation parameters """ @@ -705,7 +695,6 @@ def __init__(self, paragraphs=None, sentences=None, words=None, extendedWordList self.sentences = self.getAsTupleOrElse(sentences, (1, 1), "sentences") self.wordList = extendedWordList if extendedWordList is not None else _WORDS_LOWER self.shape = [self.paragraphs[1], self.sentences[1], self.words[1]] - self.punctuation = np.array(punctuation) if punctuation is not None else np.array(DEFAULT_PUNCTUATION) # values needed for the text generation # numpy uses fixed sizes for strings , so compute whats needed @@ -738,15 +727,11 @@ def _processWordList(self): self._wordOffsetSize = all_words.size self._sentenceEndOffset = all_words.size - - punctuation = [] - punctuation.extend(self.punctuation) - punctuation.extend(SPACE_SYMBOLS) - - self._paragraphEnd = self._sentenceEndOffset + len(self.punctuation) + self._paragraphEnd = self._sentenceEndOffset + 1 self._wordSpaceOffset = self._paragraphEnd + 1 self._emptyStringOffset = self._wordSpaceOffset + 1 + punctuation = [". 
", "\n\n", " ", ""] all_words = np.concatenate((all_words, punctuation)) self._startOfCapitalsOffset = all_words.size @@ -845,8 +830,6 @@ def generateText(self, baseValues, rowCount=1): new_col = new_word_offsets[:, :, :, np.newaxis] terminated_word_offsets = np.ma.concatenate((masked_offsets, new_col), axis=3) new_column = terminated_word_offsets[:, :, :, -1] - - # TODO: modify to add punctuation to end of sentences new_column[~new_column.mask] = self._sentenceEndOffset # reshape to paragraphs diff --git a/docs/utils/mk_quick_index.py b/docs/utils/mk_quick_index.py index c3d08953..524aaba9 100644 --- a/docs/utils/mk_quick_index.py +++ b/docs/utils/mk_quick_index.py @@ -33,6 +33,8 @@ "grouping": "main classes"}, "text_generator_plugins.py": {"briefDesc": "Text data generation", "grouping": "main classes"}, + "text_generatestring.py": {"briefDesc": "Text data generation", + "grouping": "main classes"}, "data_analyzer.py": {"briefDesc": "Analysis of existing data", "grouping": "main classes"}, "function_builder.py": {"briefDesc": "Internal utilities to create functions related to weights", diff --git a/tests/test_text_generatestring.py b/tests/test_text_generatestring.py new file mode 100644 index 00000000..9d6fc599 --- /dev/null +++ b/tests/test_text_generatestring.py @@ -0,0 +1,99 @@ +import pytest +import pyspark.sql.functions as F +from pyspark.sql.types import BooleanType, DateType +from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType + +import dbldatagen as dg + +spark = dg.SparkSingleton.getLocalInstance("unit tests") + +spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "20000") +spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true") + +#: list of digits for template generation +_DIGITS_ZERO = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'] + +#: list of uppercase letters for template generation +_LETTERS_UPPER = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 
'P', + 'Q', 'R', 'T', 'S', 'U', 'V', 'W', 'X', 'Y', 'Z'] + +#: list of lowercase letters for template generation +_LETTERS_LOWER = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', + 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] + +#: list of all letters uppercase and lowercase +_LETTERS_ALL = _LETTERS_LOWER + _LETTERS_UPPER + +#: list of alphanumeric chars in lowercase +_ALNUM_LOWER = _LETTERS_LOWER + _DIGITS_ZERO + +#: list of alphanumeric chars in uppercase +_ALNUM_UPPER = _LETTERS_UPPER + _DIGITS_ZERO + + +# Test manipulation and generation of test data for a large schema +class TestTextGenerateString: + + @pytest.mark.parametrize("length, leadingAlpha, allUpper, allLower, allAlpha, customChars", + [ + (5, True, True, False, False, None), + (5, True, False, True, False, None), + (5, True, False, False, True, None), + (5, False, False, False, False, None), + (5, False, True, False, True, None), + (5, False, False, True, True, None), + (5, False, False, False, False, "01234567890ABCDEF"), + ]) + def test_basics(self, length, leadingAlpha, allUpper, allLower, allAlpha, customChars): + + tg1 = dg.GenerateString(length, leadingAlpha=leadingAlpha, allUpper=allUpper, allLower=allLower, + allAlpha=allAlpha, customChars=customChars) + + assert tg1._charAlphabet is not None + assert tg1._firstCharAlphabet is not None + + if allUpper and allAlpha: + alphabet = _LETTERS_UPPER + elif allLower and allAlpha: + alphabet = _LETTERS_LOWER + elif allLower: + alphabet = _LETTERS_LOWER + _DIGITS_ZERO + elif allUpper: + alphabet = _LETTERS_UPPER + _DIGITS_ZERO + elif allAlpha: + alphabet = _LETTERS_UPPER + _LETTERS_LOWER + else: + alphabet = _LETTERS_UPPER + _LETTERS_LOWER + _DIGITS_ZERO + + if customChars is not None: + alphabet = set(alphabet).intersection(set(customChars)) + + assert set(tg1._charAlphabet) == set(alphabet) + + @pytest.mark.parametrize("genstr", + [ + dg.GenerateString((1, 10)), + dg.GenerateString((1, 10), leadingAlpha=True), + 
dg.GenerateString((4, 64), allUpper=True), + dg.GenerateString((10, 20), allLower=True), + dg.GenerateString((1, 10)), + dg.GenerateString((3, 15)), + dg.GenerateString((17, 22)), + dg.GenerateString((1, 10)), + ]) + def test_simple_data(self, genstr): + dgspec = (dg.DataGenerator(sparkSession=spark, name="alt_data_set", rows=10000, + partitions=4, seedMethod='hash_fieldname', verbose=True, + seedColumnName="_id") + .withIdOutput() + .withColumn("code2", IntegerType(), min=0, max=10) + .withColumn("code3", StringType(), values=['a', 'b', 'c']) + .withColumn("code4", StringType(), values=['a', 'b', 'c'], random=True) + .withColumn("code5", StringType(), text=dg.GenerateString((1, 10))) + ) + + fieldsFromGenerator = set(dgspec.getOutputColumnNames()) + + df_testdata = dgspec.build() + + df_testdata.show() From df4dd558fafa340d3bebbe1ef6f45175f6c6cc4c Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Thu, 1 Jun 2023 02:04:32 -0700 Subject: [PATCH 19/20] wip --- dbldatagen/__init__.py | 2 +- makefile | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dbldatagen/__init__.py b/dbldatagen/__init__.py index 101b3f84..69956112 100644 --- a/dbldatagen/__init__.py +++ b/dbldatagen/__init__.py @@ -47,7 +47,7 @@ __all__ = ["data_generator", "data_analyzer", "schema_parser", "daterange", "nrange", "column_generation_spec", "utils", "function_builder", "spark_singleton", "text_generators", "datarange", "datagen_constants", - "text_generator_plugins", "html_utils", "text_generatestring", "value_based_prng" + "text_generator_plugins", "html_utils", "text_generatestring" ] diff --git a/makefile b/makefile index 7c006b3e..d16fb159 100644 --- a/makefile +++ b/makefile @@ -89,11 +89,11 @@ dev-test: dev-lint-report: @echo "$(OK_COLOR)=> Running Prospector lint reporting $(PWD) $(NO_COLOR)" - prospector --profile prospector.yaml > prospector_report.txt + prospector --profile prospector.yaml dbldatagen > prospector_report.txt dev-lint: @echo "$(OK_COLOR)=> Running 
Prospector lint reporting $(PWD) $(NO_COLOR)" - prospector --profile prospector.yaml + prospector --profile prospector.yaml dbldatagen dev-test-with-html-report: @echo "$(OK_COLOR)=> Running unit tests with HTML test coverage report$(NO_COLOR)" From 0a93d0f646863db9041a08aaab5470751abcd3f0 Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Thu, 1 Jun 2023 02:26:06 -0700 Subject: [PATCH 20/20] wip --- tests/test_generation_from_data.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/test_generation_from_data.py b/tests/test_generation_from_data.py index fab15809..25c36aa1 100644 --- a/tests/test_generation_from_data.py +++ b/tests/test_generation_from_data.py @@ -71,6 +71,9 @@ def test_code_generation1(self, generation_spec, setupLogging): ast_tree = ast.parse(generatedCode) assert ast_tree is not None + generatedCode2 = analyzer.scriptDataGeneratorFromData(asHtml=True) + assert generatedCode in generatedCode2 + def test_code_generation_from_schema(self, generation_spec, setupLogging): df_source_data = generation_spec.build() generatedCode = dg.DataAnalyzer.scriptDataGeneratorFromSchema(df_source_data.schema) @@ -82,6 +85,10 @@ def test_code_generation_from_schema(self, generation_spec, setupLogging): ast_tree = ast.parse(generatedCode) assert ast_tree is not None + generatedCode2 = dg.DataAnalyzer.scriptDataGeneratorFromSchema(df_source_data.schema, asHtml=True) + + assert generatedCode in generatedCode2 + def test_summarize(self, testLogger, generation_spec): testLogger.info("Building test data")