From aebbb06116184ead8e5f98a4c7588d75e5e3a54f Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Sat, 25 Mar 2023 17:18:57 -0700 Subject: [PATCH 01/20] wip --- docs/source/extending_text_generation.rst | 6 +- docs/source/generating_cdc_data.rst | 53 ++++++------- docs/source/generating_json_data.rst | 91 +++++++++++----------- docs/source/multi_table_data.rst | 91 +++++++++++----------- docs/source/troubleshooting.rst | 93 ++++++++++++----------- 5 files changed, 174 insertions(+), 160 deletions(-) diff --git a/docs/source/extending_text_generation.rst b/docs/source/extending_text_generation.rst index 7f0df8a4..f59e7171 100644 --- a/docs/source/extending_text_generation.rst +++ b/docs/source/extending_text_generation.rst @@ -38,7 +38,7 @@ extended syntax. .withColumn("address", text=fakerText("address" )) .withColumn("email", text=fakerText("ascii_company_email") ) .withColumn("ip_address", text=fakerText("ipv4_private" )) - .withColumn("faker_text", text=fakerText("sentence", ext_word_list=my_word_list) ) + .withColumn("faker_text", text=fakerText("sentence", ext_word_list=my_word_list)) ) dfFakerOnly = fakerDataspec.build() @@ -91,7 +91,9 @@ The following code shows use of a custom Python function to generate text: pluginDataspec = (DataGenerator(spark, rows=data_rows, partitions=partitions_requested, randomSeedMethod="hash_fieldname") - .withColumn("text", text=PyfuncText(text_generator, initFn=initPluginContext)) + .withColumn("text", + text=PyfuncText(text_generator, + initFn=initPluginContext)) ) dfPlugin = pluginDataspec.build() diff --git a/docs/source/generating_cdc_data.rst b/docs/source/generating_cdc_data.rst index 1633ce41..ccbf16b4 100644 --- a/docs/source/generating_cdc_data.rst +++ b/docs/source/generating_cdc_data.rst @@ -1,7 +1,7 @@ .. Test Data Generator documentation master file, created by - sphinx-quickstart on Sun Jun 21 10:54:30 2020. 
- You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. +sphinx-quickstart on Sun Jun 21 10:54:30 2020. +You can adapt this file completely to your liking, but it should at least +contain the root `toctree` directive. Generating Change Data Capture Data =================================== @@ -47,28 +47,30 @@ We'll add a timestamp for when the row was generated and a memo field to mark wh uniqueCustomers = 10 * 1000000 - dataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested) - .withColumn("customer_id","long", uniqueValues=uniqueCustomers) - .withColumn("name", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w') - .withColumn("alias", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w') - .withColumn("payment_instrument_type", values=['paypal', 'Visa', 'Mastercard', - 'American Express', 'discover', 'branded visa', 'branded mastercard'], - random=True, distribution="normal") - .withColumn("int_payment_instrument", "int", minValue=0000, maxValue=9999, baseColumn="customer_id", - baseColumnType="hash", omit=True) - .withColumn("payment_instrument", expr="format_number(int_payment_instrument, '**** ****** *####')", - baseColumn="int_payment_instrument") - .withColumn("email", template=r'\\w.\\w@\\w.com|\\w-\\w@\\w') - .withColumn("email2", template=r'\\w.\\w@\\w.com') - .withColumn("ip_address", template=r'\\n.\\n.\\n.\\n') - .withColumn("md5_payment_instrument", - expr="md5(concat(payment_instrument_type, ':', payment_instrument))", - base_column=['payment_instrument_type', 'payment_instrument']) - .withColumn("customer_notes", text=dg.ILText(words=(1,8))) - .withColumn("created_ts", "timestamp", expr="now()") - .withColumn("modified_ts", "timestamp", expr="now()") - .withColumn("memo", expr="'original data'") - ) + dataspec = ( + dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested) + .withColumn("customer_id","long", uniqueValues=uniqueCustomers) + 
.withColumn("name", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w') + .withColumn("alias", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w') + .withColumn("payment_instrument_type", values=['paypal', 'Visa', 'Mastercard', + 'American Express', 'discover', 'branded visa', 'branded mastercard'], + random=True, distribution="normal") + .withColumn("int_payment_instrument", "int", minValue=0000, maxValue=9999, + baseColumn="customer_id", baseColumnType="hash", omit=True) + .withColumn("payment_instrument", + expr="format_number(int_payment_instrument, '**** ****** *####')", + baseColumn="int_payment_instrument") + .withColumn("email", template=r'\\w.\\w@\\w.com|\\w-\\w@\\w') + .withColumn("email2", template=r'\\w.\\w@\\w.com') + .withColumn("ip_address", template=r'\\n.\\n.\\n.\\n') + .withColumn("md5_payment_instrument", + expr="md5(concat(payment_instrument_type, ':', payment_instrument))", + base_column=['payment_instrument_type', 'payment_instrument']) + .withColumn("customer_notes", text=dg.ILText(words=(1,8))) + .withColumn("created_ts", "timestamp", expr="now()") + .withColumn("modified_ts", "timestamp", expr="now()") + .withColumn("memo", expr="'original data'") + ) df1 = dataspec.build() # write table @@ -168,7 +170,6 @@ values of the columns from the source table will be used. ]) print(sqlStmt) - spark.sql(sqlStmt) That's all that's required to perform merges with the data generation framework. diff --git a/docs/source/generating_json_data.rst b/docs/source/generating_json_data.rst index 56989d76..a26db3ee 100644 --- a/docs/source/generating_json_data.rst +++ b/docs/source/generating_json_data.rst @@ -195,51 +195,52 @@ functions such as `named_struct` and `to_json`. 
lines = ['delta', 'xyzzy', 'lakehouse', 'gadget', 'droid'] - testDataSpec = (dg.DataGenerator(spark, name="device_data_set", rows=1000000, - partitions=8, - randomSeedMethod='hash_fieldname') - .withIdOutput() - # we'll use hash of the base field to generate the ids to - # avoid a simple incrementing sequence - .withColumn("internal_device_id", LongType(), minValue=0x1000000000000, - uniqueValues=device_population, omit=True, baseColumnType="hash") - - # note for format strings, we must use "%lx" not "%x" as the - # underlying value is a long - .withColumn("device_id", StringType(), format="0x%013x", - baseColumn="internal_device_id") - - # the device / user attributes will be the same for the same device id - # so lets use the internal device id as the base column for these attribute - .withColumn("country", StringType(), values=country_codes, - weights=country_weights, - baseColumn="internal_device_id") - - .withColumn("manufacturer", StringType(), values=manufacturers, - baseColumn="internal_device_id", omit=True) - .withColumn("line", StringType(), values=lines, baseColumn="manufacturer", - baseColumnType="hash", omit=True) - .withColumn("manufacturer_info", "string", - expr="to_json(named_struct('line', line, 'manufacturer', manufacturer))", - baseColumn=['manufacturer', 'line']) - - - .withColumn("model_ser", IntegerType(), minValue=1, maxValue=11, - baseColumn="device_id", - baseColumnType="hash", omit=True) - - .withColumn("event_type", StringType(), - values=["activation", "deactivation", "plan change", - "telecoms activity", "internet activity", "device error"], - random=True, omit=True) - .withColumn("event_ts", "timestamp", - begin="2020-01-01 01:00:00", end="2020-12-31 23:59:00", - interval="1 minute", random=True, omit=True) - - .withColumn("event_info", "string", - expr="to_json(named_struct('event_type', event_type, 'event_ts', event_ts))", - baseColumn=['event_type', 'event_ts']) - ) + testDataSpec = ( + dg.DataGenerator(spark, 
name="device_data_set", rows=1000000, + partitions=8, + randomSeedMethod='hash_fieldname') + .withIdOutput() + # we'll use hash of the base field to generate the ids to + # avoid a simple incrementing sequence + .withColumn("internal_device_id", LongType(), minValue=0x1000000000000, + uniqueValues=device_population, omit=True, baseColumnType="hash") + + # note for format strings, we must use "%lx" not "%x" as the + # underlying value is a long + .withColumn("device_id", StringType(), format="0x%013x", + baseColumn="internal_device_id") + + # the device / user attributes will be the same for the same device id + # so lets use the internal device id as the base column for these attribute + .withColumn("country", StringType(), values=country_codes, + weights=country_weights, + baseColumn="internal_device_id") + + .withColumn("manufacturer", StringType(), values=manufacturers, + baseColumn="internal_device_id", omit=True) + .withColumn("line", StringType(), values=lines, baseColumn="manufacturer", + baseColumnType="hash", omit=True) + .withColumn("manufacturer_info", "string", + expr="to_json(named_struct('line', line, 'manufacturer', manufacturer))", + baseColumn=['manufacturer', 'line']) + + + .withColumn("model_ser", IntegerType(), minValue=1, maxValue=11, + baseColumn="device_id", + baseColumnType="hash", omit=True) + + .withColumn("event_type", StringType(), + values=["activation", "deactivation", "plan change", + "telecoms activity", "internet activity", "device error"], + random=True, omit=True) + .withColumn("event_ts", "timestamp", + begin="2020-01-01 01:00:00", end="2020-12-31 23:59:00", + interval="1 minute", random=True, omit=True) + + .withColumn("event_info", "string", + expr="to_json(named_struct('event_type', event_type, 'event_ts', event_ts))", + baseColumn=['event_type', 'event_ts']) + ) dfTestData = testDataSpec.build() diff --git a/docs/source/multi_table_data.rst b/docs/source/multi_table_data.rst index 5eab313d..d5aa5ec2 100644 --- 
a/docs/source/multi_table_data.rst +++ b/docs/source/multi_table_data.rst @@ -1,7 +1,7 @@ .. Test Data Generator documentation master file, created by - sphinx-quickstart on Sun Jun 21 10:54:30 2020. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. +sphinx-quickstart on Sun Jun 21 10:54:30 2020. +You can adapt this file completely to your liking, but it should at least +contain the root `toctree` directive. Generating and Using Data with Multiple Tables ============================================== @@ -73,7 +73,9 @@ Here we use a simple sequence for our plan ids. import dbldatagen as dg import pyspark.sql.functions as F - spark.catalog.clearCache() # clear cache so that if we run multiple times to check performance, we're not relying on cache + # clear cache so that if we run multiple times to check performance, + # we're not relying on cache + spark.catalog.clearCache() UNIQUE_PLANS = 20 PLAN_MIN_VALUE = 100 @@ -87,36 +89,35 @@ Here we use a simple sequence for our plan ids. 
spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", 20000) - plan_dataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested) - .withColumn("plan_id","int", minValue=PLAN_MIN_VALUE, uniqueValues=UNIQUE_PLANS) - # use plan_id as root value - .withColumn("plan_name", prefix="plan", baseColumn="plan_id") - - # note default step is 1 so you must specify a step for small number ranges, - .withColumn("cost_per_mb", "decimal(5,3)", minValue=0.005, maxValue=0.050, - step=0.005, random=True) - .withColumn("cost_per_message", "decimal(5,3)", minValue=0.001, maxValue=0.02, - step=0.001, random=True) - .withColumn("cost_per_minute", "decimal(5,3)", minValue=0.001, maxValue=0.01, - step=0.001, random=True) - - # we're modelling long distance and international prices simplistically - - # each is a multiplier thats applied to base rate - .withColumn("ld_multiplier", "decimal(5,3)", minValue=1.5, maxValue=3, step=0.05, - random=True, distribution="normal", omit=True) - .withColumn("ld_cost_per_minute", "decimal(5,3)", - expr="cost_per_minute * ld_multiplier", - baseColumns=['cost_per_minute', 'ld_multiplier']) - .withColumn("intl_multiplier", "decimal(5,3)", minValue=2, maxValue=4, step=0.05, - random=True, distribution="normal", omit=True) - .withColumn("intl_cost_per_minute", "decimal(5,3)", - expr="cost_per_minute * intl_multiplier", - baseColumns=['cost_per_minute', 'intl_multiplier']) + plan_dataspec = ( + dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested) + .withColumn("plan_id","int", minValue=PLAN_MIN_VALUE, uniqueValues=UNIQUE_PLANS) + # use plan_id as root value + .withColumn("plan_name", prefix="plan", baseColumn="plan_id") + + # note default step is 1 so you must specify a step for small number ranges, + .withColumn("cost_per_mb", "decimal(5,3)", minValue=0.005, maxValue=0.050, + step=0.005, random=True) + .withColumn("cost_per_message", "decimal(5,3)", minValue=0.001, maxValue=0.02, + step=0.001, random=True) + 
.withColumn("cost_per_minute", "decimal(5,3)", minValue=0.001, maxValue=0.01, + step=0.001, random=True) + + # we're modelling long distance and international prices simplistically - + # each is a multiplier thats applied to base rate + .withColumn("ld_multiplier", "decimal(5,3)", minValue=1.5, maxValue=3, step=0.05, + random=True, distribution="normal", omit=True) + .withColumn("ld_cost_per_minute", "decimal(5,3)", + expr="cost_per_minute * ld_multiplier", + baseColumns=['cost_per_minute', 'ld_multiplier']) + .withColumn("intl_multiplier", "decimal(5,3)", minValue=2, maxValue=4, step=0.05, + random=True, distribution="normal", omit=True) + .withColumn("intl_cost_per_minute", "decimal(5,3)", + expr="cost_per_minute * intl_multiplier", + baseColumns=['cost_per_minute', 'intl_multiplier']) ) - df_plans = (plan_dataspec.build() - .cache() - ) + df_plans = plan_dataspec.build().cache() display(df_plans) @@ -195,10 +196,11 @@ when using hashed values, the range of the hashes produced can be large. 
effective_customers = df_customers.count() - print(stripMargin(f"""revised customers : {df_customers.count()}, - | unique customers: {df_customers.select(F.countDistinct('customer_id')).take(1)[0][0]}, - | unique device ids: {df_customers.select(F.countDistinct('device_id')).take(1)[0][0]}, - | unique phone numbers: {df_customers.select(F.countDistinct('phone_number')).take(1)[0][0]}""") + print(stripMargin( + f"""revised customers : {df_customers.count()}, + | unique customers: {df_customers.select(F.countDistinct('customer_id')).take(1)[0][0]}, + | unique device ids: {df_customers.select(F.countDistinct('device_id')).take(1)[0][0]}, + | unique phone numbers: {df_customers.select(F.countDistinct('phone_number')).take(1)[0][0]}""") ) display(df_customers) @@ -247,7 +249,8 @@ A simple approach is simply to multiply the # use random seed method of 'hash_fieldname' for better spread - default in later builds events_dataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested, randomSeed=42, randomSeedMethod="hash_fieldname") - # use same logic as per customers dataset to ensure matching keys - but make them random + # use same logic as per customers dataset to ensure matching keys + # but make them random .withColumn("device_id_base","decimal(10)", minValue=CUSTOMER_MIN_VALUE, uniqueValues=UNIQUE_CUSTOMERS, random=True, omit=True) @@ -260,12 +263,14 @@ A simple approach is simply to multiply the weights=[50, 50, 20, 10, 5 ], random=True) # use Gamma distribution for skew towards short calls - .withColumn("base_minutes","decimal(7,2)", minValue=1.0, maxValue=100.0, step=0.1, + .withColumn("base_minutes","decimal(7,2)", + minValue=1.0, maxValue=100.0, step=0.1, distribution=dg.distributions.Gamma(shape=1.5, scale=2.0), random=True, omit=True) # use Gamma distribution for skew towards short transfers - .withColumn("base_bytes_transferred","decimal(12)", minValue=K_1, maxValue=MB_100, + .withColumn("base_bytes_transferred","decimal(12)", + 
minValue=K_1, maxValue=MB_100, distribution=dg.distributions.Gamma(shape=0.75, scale=2.0), random=True, omit=True) @@ -308,8 +313,7 @@ Let's compute the customers and associated plans import pyspark.sql.functions as F import pyspark.sql.types as T - df_customer_pricing = df_customers.join(df_plans, - df_plans.plan_id == df_customers.plan) + df_customer_pricing = df_customers.join(df_plans, df_plans.plan_id == df_customers.plan) display(df_customer_pricing) @@ -365,8 +369,9 @@ now let's compute the invoices .. code-block:: python - df_customer_summary = (df_customer_pricing.join(df_summary, - df_customer_pricing.device_id == df_summary.device_id ) + df_customer_summary = ( + df_customer_pricing.join(df_summary, + df_customer_pricing.device_id == df_summary.device_id ) .createOrReplaceTempView("customer_summary")) df_invoices = spark.sql(""" diff --git a/docs/source/troubleshooting.rst b/docs/source/troubleshooting.rst index 386d35ab..83d721b3 100644 --- a/docs/source/troubleshooting.rst +++ b/docs/source/troubleshooting.rst @@ -165,50 +165,55 @@ In these cases, we use the `baseColumn` attribute to ensure the correct column b lines = ['delta', 'xyzzy', 'lakehouse', 'gadget', 'droid'] - testDataSpec = (dg.DataGenerator(spark, name="device_data_set", rows=1000000, - partitions=8, - randomSeedMethod='hash_fieldname') - # we'll use hash of the base field to generate the ids to - # avoid a simple incrementing sequence - .withColumn("internal_device_id", "long", minValue=0x1000000000000, - uniqueValues=device_population, omit=True, baseColumnType="hash") - - # note for format strings, we must use "%lx" not "%x" as the - # underlying value is a long - .withColumn("device_id", "string", format="0x%013x", - baseColumn="internal_device_id") - - # the device / user attributes will be the same for the same device id - # so lets use the internal device id as the base column for these attribute - .withColumn("country", "string", values=country_codes, - weights=country_weights, - 
baseColumn="internal_device_id") - - .withColumn("manufacturer", "string", values=manufacturers, - baseColumn="internal_device_id", omit=True) - - .withColumn("line", StringType(), values=lines, baseColumn="manufacturer", - baseColumnType="hash", omit=True) - - # note use of baseColumn to control column build ordering - .withColumn("manufacturer_info", "string", - expr="to_json(named_struct('line', line, 'manufacturer', manufacturer))", - baseColumn=["line", "manufacturer"] - ) - - .withColumn("event_type", "string", - values=["activation", "deactivation", "plan change", - "telecoms activity", "internet activity", "device error"], - random=True, omit=True) - - .withColumn("event_ts", "timestamp", begin="2020-01-01 01:00:00", end="2020-12-31 23:59:00", - interval="1 minute", random=True, omit=True) - - # note use of baseColumn to control column build ordering - .withColumn("event_info", "string", - expr="to_json(named_struct('event_type', event_type, 'event_ts', event_ts))", - baseColumn=["event_type", "event_ts"]) - ) + testDataSpec = ( + dg.DataGenerator(spark, name="device_data_set", rows=1000000, + partitions=8, + randomSeedMethod='hash_fieldname') + # we'll use hash of the base field to generate the ids to + # avoid a simple incrementing sequence + .withColumn("internal_device_id", "long", minValue=0x1000000000000, + uniqueValues=device_population, omit=True, baseColumnType="hash") + + # note for format strings, we must use "%lx" not "%x" as the + # underlying value is a long + .withColumn("device_id", "string", format="0x%013x", + baseColumn="internal_device_id") + + # the device / user attributes will be the same for the same device id + # so lets use the internal device id as the base column for these attribute + .withColumn("country", "string", values=country_codes, + weights=country_weights, + baseColumn="internal_device_id") + + .withColumn("manufacturer", "string", values=manufacturers, + baseColumn="internal_device_id", omit=True) + + 
.withColumn("line", StringType(), values=lines, baseColumn="manufacturer", + baseColumnType="hash", omit=True) + + # note use of baseColumn to control column build ordering + .withColumn("manufacturer_info", "string", + expr="to_json(named_struct('line', line, 'manufacturer', manufacturer))", + baseColumn=["line", "manufacturer"] + ) + + .withColumn("event_type", "string", + values=["activation", "deactivation", "plan change", + "telecoms activity", "internet activity", "device error"], + random=True, omit=True) + + .withColumn("event_ts", "timestamp", + begin="2020-01-01 01:00:00", + end="2020-12-31 23:59:00", + interval="1 minute", + random=True, + omit=True) + + # note use of baseColumn to control column build ordering + .withColumn("event_info", "string", + expr="to_json(named_struct('event_type', event_type, 'event_ts', event_ts))", + baseColumn=["event_type", "event_ts"]) + ) dfTestData = testDataSpec.build() From c4fdc3bc8e77a7eeaaa2ff03797cf160675cb79b Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Tue, 9 May 2023 11:59:45 -0700 Subject: [PATCH 02/20] wip --- python/dev_require.txt | 1 - python/require.txt | 1 - 2 files changed, 2 deletions(-) diff --git a/python/dev_require.txt b/python/dev_require.txt index 8d53d810..a34ed3b2 100644 --- a/python/dev_require.txt +++ b/python/dev_require.txt @@ -31,7 +31,6 @@ pypandoc ipython==7.22.0 recommonmark sphinx-markdown-builder -rst2pdf==0.98 Jinja2 < 3.1 sphinx-copybutton diff --git a/python/require.txt b/python/require.txt index 53f80fde..5f0e30a4 100644 --- a/python/require.txt +++ b/python/require.txt @@ -30,7 +30,6 @@ pypandoc ipython==7.22.0 recommonmark sphinx-markdown-builder -rst2pdf==0.98 Jinja2 < 3.1 sphinx-copybutton From 46969e7647be3fa2641ab69bf2dcea21727f6920 Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Tue, 9 May 2023 12:07:07 -0700 Subject: [PATCH 03/20] wip --- .github/workflows/push.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/push.yml 
b/.github/workflows/push.yml index 7bc7bc88..fbdfa548 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -31,7 +31,7 @@ jobs: - name: Set up Python 3.8 uses: actions/setup-python@v3 with: - python-version: '3.8' + python-version: '3.8.10' cache: 'pipenv' - name: Install From cf9e909e555eb218b772c0a219748595f4b16c3e Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Tue, 9 May 2023 12:10:53 -0700 Subject: [PATCH 04/20] wip --- .github/workflows/push.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index fbdfa548..8192e3a9 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -29,7 +29,7 @@ jobs: # ${{ runner.os }}-go- - name: Set up Python 3.8 - uses: actions/setup-python@v3 + uses: actions/setup-python@v4 with: python-version: '3.8.10' cache: 'pipenv' From ba929ac892c79b8fe1f5469d05ec2f2bd1f38578 Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Tue, 9 May 2023 12:13:06 -0700 Subject: [PATCH 05/20] wip --- .github/workflows/push.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 8192e3a9..5e9a736b 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -31,7 +31,7 @@ jobs: - name: Set up Python 3.8 uses: actions/setup-python@v4 with: - python-version: '3.8.10' + python-version: '3.8.12' cache: 'pipenv' - name: Install From 03245c392b71eb42b538e6489d97e99d68b8ac29 Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Tue, 9 May 2023 12:15:05 -0700 Subject: [PATCH 06/20] wip --- .github/workflows/push.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 5e9a736b..1a41f6b2 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -34,6 +34,9 @@ jobs: python-version: '3.8.12' cache: 'pipenv' + - name: Install pip + run: python -m pip install --upgrade pip + - name: 
Install run: pip install pipenv From 4b94465526facd93cd93933e7ae1ebedd51ff879 Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Tue, 9 May 2023 12:17:24 -0700 Subject: [PATCH 07/20] wip --- .github/workflows/push.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 1a41f6b2..de6893f4 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -28,15 +28,15 @@ jobs: # restore-keys: | # ${{ runner.os }}-go- - - name: Set up Python 3.8 + - name: Set up Python 3.9 uses: actions/setup-python@v4 with: - python-version: '3.8.12' + python-version: '3.9' cache: 'pipenv' - name: Install pip run: python -m pip install --upgrade pip - + - name: Install run: pip install pipenv From ff1574c359908d67a2e7a2b196ce1db1b2d40225 Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Tue, 9 May 2023 12:20:11 -0700 Subject: [PATCH 08/20] wip --- .github/workflows/push.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index de6893f4..699bb31e 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -8,11 +8,11 @@ on: jobs: tests: - # Ubuntu latest no longer installs Python 3.8 by default so install it + # Ubuntu latest no longer installs Python 3.9 by default so install it runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Unshallow run: git fetch --prune --unshallow From 97616edd58265b96b996d5c258c2776434472cf7 Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Tue, 9 May 2023 12:25:19 -0700 Subject: [PATCH 09/20] wip --- .github/workflows/push.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 699bb31e..3d84fe78 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -13,6 +13,8 @@ jobs: steps: - name: Checkout uses: actions/checkout@v3 + 
with: + fetch-depth: 0 - name: Unshallow run: git fetch --prune --unshallow @@ -34,6 +36,9 @@ jobs: python-version: '3.9' cache: 'pipenv' + - name: Check Python version + run: python --version + - name: Install pip run: python -m pip install --upgrade pip From 064ab0ae28488f152e059c3ac2a1ec55ea79b85d Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Tue, 9 May 2023 12:26:36 -0700 Subject: [PATCH 10/20] wip --- .github/workflows/push.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 3d84fe78..69a2802a 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -16,9 +16,6 @@ jobs: with: fetch-depth: 0 - - name: Unshallow - run: git fetch --prune --unshallow - # - name: Cache packages # uses: actions/cache@v2 # with: From 613bf9c2155cbb6299c60b7df0ea14912a222038 Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Tue, 9 May 2023 12:31:35 -0700 Subject: [PATCH 11/20] wip --- .github/workflows/push.yml | 4 ++-- Pipfile | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 69a2802a..5e9768c4 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -27,10 +27,10 @@ jobs: # restore-keys: | # ${{ runner.os }}-go- - - name: Set up Python 3.9 + - name: Set up Python 3.8 uses: actions/setup-python@v4 with: - python-version: '3.9' + python-version: '3.8.12' cache: 'pipenv' - name: Check Python version diff --git a/Pipfile b/Pipfile index e6c00c15..0ac73774 100644 --- a/Pipfile +++ b/Pipfile @@ -28,4 +28,4 @@ pyparsing = "==2.4.7" jmespath = "==0.10.0" [requires] -python_version = ">=3.8.10" +python_version = "==3.8.12" From 1b90eb367e3dccb347f8440afde59a13dc50485e Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Tue, 9 May 2023 12:34:21 -0700 Subject: [PATCH 12/20] wip --- Pipfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Pipfile b/Pipfile index 0ac73774..ee4d7609 100644 
--- a/Pipfile +++ b/Pipfile @@ -28,4 +28,4 @@ pyparsing = "==2.4.7" jmespath = "==0.10.0" [requires] -python_version = "==3.8.12" +python_version = "3.8.12" From 65f4d54e35ac88bce293fd1aeada5b400fae1b4c Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Tue, 9 May 2023 12:53:49 -0700 Subject: [PATCH 13/20] updated build version --- .github/workflows/onrelease.yml | 13 ++++++++----- CONTRIBUTING.md | 6 +++--- README.md | 2 +- dbldatagen/_version.py | 2 +- docs/source/conf.py | 2 +- python/.bumpversion.cfg | 2 +- setup.py | 2 +- 7 files changed, 16 insertions(+), 13 deletions(-) diff --git a/.github/workflows/onrelease.yml b/.github/workflows/onrelease.yml index 479a0cbf..3827a3d6 100644 --- a/.github/workflows/onrelease.yml +++ b/.github/workflows/onrelease.yml @@ -18,15 +18,18 @@ jobs: - name: Checkout uses: actions/checkout@v2 - - name: Unshallow - run: git fetch --prune --unshallow - - name: Set up Python 3.8 - uses: actions/setup-python@v3 + uses: actions/setup-python@v4 with: - python-version: '3.8' + python-version: '3.8.12' cache: 'pipenv' + - name: Check Python version + run: python --version + + - name: Install pip + run: python -m pip install --upgrade pip + - name: Install run: pip install pipenv diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2d20cbd6..9402e2d4 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -19,10 +19,10 @@ Dependent packages are not installed automatically by the `dbldatagen` package. ## Python compatibility -The code has been tested with Python 3.8.10 and later. +The code has been tested with Python 3.8.12 and later. -Older releases were tested with Python 3.7.5 but as of this release, it requires the Databricks runtime 9.1 LTS or later -which relies on Python 3.8.10 +Older releases were tested with Python 3.7.5 but as of this release, it requires the Databricks +runtime 9.1 LTS or later. 
## Checking your code for common issues diff --git a/README.md b/README.md index c15236a1..95df90ee 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,7 @@ details of use and many examples. Release notes and details of the latest changes for this specific release can be found in the GitHub repository -[here](https://github.com/databrickslabs/dbldatagen/blob/release/v0.3.4post2/CHANGELOG.md) +[here](https://github.com/databrickslabs/dbldatagen/blob/release/v0.3.4post3/CHANGELOG.md) # Installation diff --git a/dbldatagen/_version.py b/dbldatagen/_version.py index d432695b..3c65fd7a 100644 --- a/dbldatagen/_version.py +++ b/dbldatagen/_version.py @@ -34,7 +34,7 @@ def get_version(version): return version_info -__version__ = "0.3.4post2" # DO NOT EDIT THIS DIRECTLY! It is managed by bumpversion +__version__ = "0.3.4post3" # DO NOT EDIT THIS DIRECTLY! It is managed by bumpversion __version_info__ = get_version(__version__) diff --git a/docs/source/conf.py b/docs/source/conf.py index 4ae0871e..80d91087 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -28,7 +28,7 @@ author = 'Databricks Inc' # The full version, including alpha/beta/rc tags -release = "0.3.4post2" # DO NOT EDIT THIS DIRECTLY! It is managed by bumpversion +release = "0.3.4post3" # DO NOT EDIT THIS DIRECTLY! 
It is managed by bumpversion # -- General configuration --------------------------------------------------- diff --git a/python/.bumpversion.cfg b/python/.bumpversion.cfg index 5c7c1e86..365a83e3 100644 --- a/python/.bumpversion.cfg +++ b/python/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.3.4post2 +current_version = 0.3.4post3 commit = False tag = False parse = (?P\d+)\.(?P\d+)\.(?P\d+){0,1}(?P\D*)(?P\d*) diff --git a/setup.py b/setup.py index 34375fb3..91f3fb1c 100644 --- a/setup.py +++ b/setup.py @@ -31,7 +31,7 @@ setuptools.setup( name="dbldatagen", - version="0.3.4post2", + version="0.3.4post3", author="Ronan Stokes, Databricks", description="Databricks Labs - PySpark Synthetic Data Generator", long_description=long_description, From c81cc669a2091a09a71e7603573ed451303c6436 Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Tue, 9 May 2023 12:56:05 -0700 Subject: [PATCH 14/20] updated build version --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index bc48ca45..afb34eee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,10 @@ All notable changes to the Databricks Labs Data Generator will be documented in #### Changed * Added formatting of generated code as Html for script methods +### Version 0.3.4 Post 3 + +### Changed +* Build now uses Python 3.8.12. Updated build process to reflect that. 
# spacing symbols - note that order is important
+ """ - def __init__(self, paragraphs=None, sentences=None, words=None, extendedWordList=None): + def __init__(self, paragraphs=None, sentences=None, words=None, extendedWordList=None, punctuation=None): """ Initialize the ILText with text generation parameters """ @@ -685,6 +697,7 @@ def __init__(self, paragraphs=None, sentences=None, words=None, extendedWordList self.sentences = self.getAsTupleOrElse(sentences, (1, 1), "sentences") self.wordList = extendedWordList if extendedWordList is not None else _WORDS_LOWER self.shape = [self.paragraphs[1], self.sentences[1], self.words[1]] + self.punctuation = np.array(punctuation) if punctuation is not None else np.array(DEFAULT_PUNCTUATION) # values needed for the text generation # numpy uses fixed sizes for strings , so compute whats needed @@ -717,11 +730,15 @@ def _processWordList(self): self._wordOffsetSize = all_words.size self._sentenceEndOffset = all_words.size - self._paragraphEnd = self._sentenceEndOffset + 1 + + punctuation = [] + punctuation.extend(self.punctuation) + punctuation.extend(SPACE_SYMBOLS) + + self._paragraphEnd = self._sentenceEndOffset + len(self.punctuation) self._wordSpaceOffset = self._paragraphEnd + 1 self._emptyStringOffset = self._wordSpaceOffset + 1 - punctuation = [". 
", "\n\n", " ", ""] all_words = np.concatenate((all_words, punctuation)) self._startOfCapitalsOffset = all_words.size @@ -820,6 +837,8 @@ def generateText(self, baseValues, rowCount=1): new_col = new_word_offsets[:, :, :, np.newaxis] terminated_word_offsets = np.ma.concatenate((masked_offsets, new_col), axis=3) new_column = terminated_word_offsets[:, :, :, -1] + + # TODO: modify to add punctuation to end of sentences new_column[~new_column.mask] = self._sentenceEndOffset # reshape to paragraphs diff --git a/tests/test_text_generation.py b/tests/test_text_generation.py index fb23d9d3..374449cc 100644 --- a/tests/test_text_generation.py +++ b/tests/test_text_generation.py @@ -272,6 +272,8 @@ def test_small_ILText_driven_data_generation(self): df_iltext_data = testDataSpec.build() + df_iltext_data.show() + counts = df_iltext_data.agg( F.countDistinct("paras").alias("paragraphs_count") ).collect()[0] From 9ff9cfef510d366bb75cb0be230844cfad601629 Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Tue, 9 May 2023 14:27:18 -0700 Subject: [PATCH 16/20] wip --- dbldatagen/text_generatestring.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 dbldatagen/text_generatestring.py diff --git a/dbldatagen/text_generatestring.py b/dbldatagen/text_generatestring.py new file mode 100644 index 00000000..e69de29b From 61115601f0b3fb0915812f5d640fb1e9948d11ee Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Tue, 9 May 2023 14:32:15 -0700 Subject: [PATCH 17/20] wip --- dbldatagen/__init__.py | 3 ++- dbldatagen/column_generation_spec.py | 2 +- dbldatagen/text_generators.py | 8 ++++++++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/dbldatagen/__init__.py b/dbldatagen/__init__.py index 49eea723..69956112 100644 --- a/dbldatagen/__init__.py +++ b/dbldatagen/__init__.py @@ -41,12 +41,13 @@ from .spark_singleton import SparkSingleton from .text_generators import TemplateGenerator, ILText, TextGenerator from .text_generator_plugins import PyfuncText, 
PyfuncTextFactory, FakerTextFactory, fakerText +from .text_generatestring import GenerateString from .html_utils import HtmlUtils __all__ = ["data_generator", "data_analyzer", "schema_parser", "daterange", "nrange", "column_generation_spec", "utils", "function_builder", "spark_singleton", "text_generators", "datarange", "datagen_constants", - "text_generator_plugins", "html_utils" + "text_generator_plugins", "html_utils", "text_generatestring" ] diff --git a/dbldatagen/column_generation_spec.py b/dbldatagen/column_generation_spec.py index 6456a389..e77ca977 100644 --- a/dbldatagen/column_generation_spec.py +++ b/dbldatagen/column_generation_spec.py @@ -1107,7 +1107,7 @@ def _applyPrefixSuffixExpressions(self, cprefix, csuffix, new_def): new_def = concat(new_def.astype(IntegerType()), lit(text_separator), lit(csuffix)) return new_def - def _applyTextGenerationExpression(self, new_def, use_pandas_optimizations): + def _applyTextGenerationExpression(self, new_def, use_pandas_optimizations=True): """Apply text generation expression to column expression :param new_def : column definition being created diff --git a/dbldatagen/text_generators.py b/dbldatagen/text_generators.py index 9c3c6b0e..b76a36fd 100644 --- a/dbldatagen/text_generators.py +++ b/dbldatagen/text_generators.py @@ -167,6 +167,14 @@ def getAsTupleOrElse(v, defaultValue, valueName): return defaultValue + def prepareBaseValue(self, baseDef): + """ Prepare the base value for processing + :param baseDef: base value expression + :return: base value expression unchanged + Derived classes are expected to override this if needed + """ + return baseDef + class TemplateGenerator(TextGenerator): # lgtm [py/missing-equals] """This class handles the generation of text from templates From 16284e0cb3da189418963b3155cb10f841d34094 Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Thu, 1 Jun 2023 01:37:22 -0700 Subject: [PATCH 18/20] added GenerateStr and html output to data analyzer --- CHANGELOG.md | 1 + 
dbldatagen/__init__.py | 2 +- dbldatagen/column_generation_spec.py | 3 + dbldatagen/data_analyzer.py | 45 ++++--- dbldatagen/text_generatestring.py | 181 +++++++++++++++++++++++++++ dbldatagen/text_generators.py | 27 +--- docs/utils/mk_quick_index.py | 2 + tests/test_text_generatestring.py | 99 +++++++++++++++ 8 files changed, 323 insertions(+), 37 deletions(-) create mode 100644 tests/test_text_generatestring.py diff --git a/CHANGELOG.md b/CHANGELOG.md index afb34eee..d55f6401 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ All notable changes to the Databricks Labs Data Generator will be documented in #### Changed * Added formatting of generated code as Html for script methods +* Added text generator `GenerateString` ### Version 0.3.4 Post 3 diff --git a/dbldatagen/__init__.py b/dbldatagen/__init__.py index 69956112..101b3f84 100644 --- a/dbldatagen/__init__.py +++ b/dbldatagen/__init__.py @@ -47,7 +47,7 @@ __all__ = ["data_generator", "data_analyzer", "schema_parser", "daterange", "nrange", "column_generation_spec", "utils", "function_builder", "spark_singleton", "text_generators", "datarange", "datagen_constants", - "text_generator_plugins", "html_utils", "text_generatestring" + "text_generator_plugins", "html_utils", "text_generatestring", "value_based_prng" ] diff --git a/dbldatagen/column_generation_spec.py b/dbldatagen/column_generation_spec.py index e77ca977..da9c99f1 100644 --- a/dbldatagen/column_generation_spec.py +++ b/dbldatagen/column_generation_spec.py @@ -1118,6 +1118,9 @@ def _applyTextGenerationExpression(self, new_def, use_pandas_optimizations=True) # while it seems like this could use a shared instance, this does not work if initialized # in a class method tg = self.textGenerator + + new_def = tg.prepareBaseValue(new_def) + if use_pandas_optimizations: self.executionHistory.append(f".. 
text generation via pandas scalar udf `{tg}`") u_value_from_generator = pandas_udf(tg.pandasGenerateText, diff --git a/dbldatagen/data_analyzer.py b/dbldatagen/data_analyzer.py index 5aec5245..ecacd95c 100644 --- a/dbldatagen/data_analyzer.py +++ b/dbldatagen/data_analyzer.py @@ -14,6 +14,7 @@ import pyspark.sql.functions as F from .utils import strip_margins +from .html_utils import HtmlUtils from .spark_singleton import SparkSingleton @@ -359,7 +360,7 @@ def _scriptDataGeneratorCode(cls, schema, dataSummary=None, sourceDf=None, suppr return "\n".join(stmts) @classmethod - def scriptDataGeneratorFromSchema(cls, schema, suppressOutput=False, name=None): + def scriptDataGeneratorFromSchema(cls, schema, suppressOutput=False, name=None, asHtml=False): """ Generate outline data generator code from an existing dataframe @@ -373,16 +374,24 @@ def scriptDataGeneratorFromSchema(cls, schema, suppressOutput=False, name=None): The dataframe to be analyzed is the dataframe passed to the constructor of the DataAnalyzer object. :param schema: Pyspark schema - i.e manually constructed StructType or return value from `dataframe.schema` - :param suppressOutput: Suppress printing of generated code if True + :param suppressOutput: Suppress printing of generated code if True. If asHtml is True, output is suppressed :param name: Optional name for data generator - :return: String containing skeleton code + :param asHtml: If True, will generate Html suitable for notebook ``displayHtml``. 
+ :return: String containing skeleton code (in Html form if `asHtml` is True) """ - return cls._scriptDataGeneratorCode(schema, - suppressOutput=suppressOutput, - name=name) + omit_output_printing = suppressOutput or asHtml + + generated_code = cls._scriptDataGeneratorCode(schema, + suppressOutput=omit_output_printing, + name=name) + + if asHtml: + generated_code = HtmlUtils.formatCodeAsHtml(generated_code) + + return generated_code - def scriptDataGeneratorFromData(self, suppressOutput=False, name=None): + def scriptDataGeneratorFromData(self, suppressOutput=False, name=None, asHtml=False): """ Generate outline data generator code from an existing dataframe @@ -395,14 +404,17 @@ def scriptDataGeneratorFromData(self, suppressOutput=False, name=None): The dataframe to be analyzed is the Spark dataframe passed to the constructor of the DataAnalyzer object - :param suppressOutput: Suppress printing of generated code if True + :param suppressOutput: Suppress printing of generated code if True. If asHtml is True, output is suppressed :param name: Optional name for data generator - :return: String containing skeleton code + :param asHtml: If True, will generate Html suitable for notebook ``displayHtml``. 
+ :return: String containing skeleton code (in Html form if `asHtml` is True) """ assert self._df is not None assert type(self._df) is ssql.DataFrame, "sourceDf must be a valid Pyspark dataframe" + omit_output_printing = suppressOutput or asHtml + if self._dataSummary is None: df_summary = self.summarizeToDF() @@ -411,8 +423,13 @@ def scriptDataGeneratorFromData(self, suppressOutput=False, name=None): row_key_pairs = row.asDict() self._dataSummary[row['measure_']] = row_key_pairs - return self._scriptDataGeneratorCode(self._df.schema, - suppressOutput=suppressOutput, - name=name, - dataSummary=self._dataSummary, - sourceDf=self._df) + generated_code = self._scriptDataGeneratorCode(self._df.schema, + suppressOutput=omit_output_printing, + name=name, + dataSummary=self._dataSummary, + sourceDf=self._df) + + if asHtml: + generated_code = HtmlUtils.formatCodeAsHtml(generated_code) + + return generated_code diff --git a/dbldatagen/text_generatestring.py b/dbldatagen/text_generatestring.py index e69de29b..fdb561bd 100644 --- a/dbldatagen/text_generatestring.py +++ b/dbldatagen/text_generatestring.py @@ -0,0 +1,181 @@ +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +This file defines the RandomStr text generator +""" + +import math +import random + +import numpy as np +import pandas as pd + +import pyspark.sql.functions as F + +from .text_generators import TextGenerator +from .text_generators import _DIGITS_ZERO, _LETTERS_UPPER, _LETTERS_LOWER, _LETTERS_ALL + + +class GenerateString(TextGenerator): # lgtm [py/missing-equals] + """This class handles the generation of string text of specified length drawn from alphanumeric characters. + + The set of chars to be used can be modified based on the parameters + + This will generate deterministic strings chosen from the pool of characters `0-9`, `a-z`, `A-Z`, or from a + custom character range if specified. + + :param length: length of string. 
+    The modifiers can be combined - for example GenerateString((1, 5), leadingAlpha=True, allUpper=True)
+
+    When the length is specified to be a tuple, it will generate variable length strings of lengths from the lower bound
+    to the upper bound inclusive.
+ + """ + + def __init__(self, length, leadingAlpha=True, allUpper=False, allLower=False, allAlpha=False, customChars=None): + super().__init__() + + assert not customChars or isinstance(customChars, (list, str)), \ + "`customChars` should be list of characters or string containing custom chars" + + assert not allUpper or not allLower, "allUpper and allLower cannot both be True" + + if isinstance(customChars, str): + assert len(customChars) > 0, "string of customChars must be non-empty" + elif isinstance(customChars, list): + assert all(isinstance(c, str) for c in customChars) + assert len(customChars) > 0, "list of customChars must be non-empty" + + self.leadingAlpha = leadingAlpha + self.allUpper = allUpper + self.allLower = allLower + self.allAlpha = allAlpha + + # determine base alphabet + if isinstance(customChars, list): + charAlphabet = set("".join(customChars)) + elif isinstance(customChars, str): + charAlphabet = set(customChars) + else: + charAlphabet = set(_LETTERS_ALL).union(set(_DIGITS_ZERO)) + + if allLower: + charAlphabet = charAlphabet.difference(set(_LETTERS_UPPER)) + elif allUpper: + charAlphabet = charAlphabet.difference(set(_LETTERS_LOWER)) + + if allAlpha: + charAlphabet = charAlphabet.difference(set(_DIGITS_ZERO)) + + self._charAlphabet = np.array(list(charAlphabet)) + + if leadingAlpha: + self._firstCharAlphabet = np.array(list(charAlphabet.difference(set(_DIGITS_ZERO)))) + else: + self._firstCharAlphabet = self._charAlphabet + + # compute string lengths + if isinstance(length, int): + self._minLength = length + self._maxLength = length + elif isinstance(length, tuple): + assert len(length) == 2, "only 2 elements can be specified if length is a tuple" + assert all(isinstance(el, int) for el in length) + self._minLength, self._maxLength = length + else: + raise ValueError("`length` must be an integer or a tuple of two integers") + + # compute bounds for generated strings + bounds = [len(self._firstCharAlphabet)] + for ix in range(1, 
+        For generate string processing, we'll use the SQL function abs(hash(baseDef))
mask=placeholder_mask) + + masked_placeholders[~placeholder_mask] = self._charAlphabet[rnds2[~placeholder_mask]] + + output = pd.Series(list(placeholders)) + + # join strings in placeholders + results = output.apply(lambda placeholder_items: "".join([str(elem) for elem in placeholder_items])) + + return results diff --git a/dbldatagen/text_generators.py b/dbldatagen/text_generators.py index b76a36fd..403e06d8 100644 --- a/dbldatagen/text_generators.py +++ b/dbldatagen/text_generators.py @@ -60,12 +60,6 @@ 'PROIDENT', 'SUNT', 'IN', 'CULPA', 'QUI', 'OFFICIA', 'DESERUNT', 'MOLLIT', 'ANIM', 'ID', 'EST', 'LABORUM'] -# spacing smbols - note that order is important -SPACE_SYMBOLS = ["\n\n", " ", ""] - -# default punctuation symbols -DEFAULT_PUNCTUATION = [". "] - class TextGenerator(object): """ Base class for text generation classes @@ -169,8 +163,10 @@ def getAsTupleOrElse(v, defaultValue, valueName): def prepareBaseValue(self, baseDef): """ Prepare the base value for processing + :param baseDef: base value expression :return: base value expression unchanged + Derived classes are expected to override this if needed """ return baseDef @@ -682,16 +678,10 @@ class ILText(TextGenerator): # lgtm [py/missing-equals] :param paragraphs: Number of paragraphs to generate. If tuple will generate random number in range :param sentences: Number of sentences to generate. If tuple will generate random number in tuple range :param words: Number of words per sentence to generate. If tuple, will generate random number in tuple range - :param punctuation: List of strings or single string of punctuation to use - - If `punctuation` is not specified, the following punctuation will be used: [". "] - - Note that punctuation will only be used at end of each logical sentence, not at end of each word. 
- """ - def __init__(self, paragraphs=None, sentences=None, words=None, extendedWordList=None, punctuation=None): + def __init__(self, paragraphs=None, sentences=None, words=None, extendedWordList=None): """ Initialize the ILText with text generation parameters """ @@ -705,7 +695,6 @@ def __init__(self, paragraphs=None, sentences=None, words=None, extendedWordList self.sentences = self.getAsTupleOrElse(sentences, (1, 1), "sentences") self.wordList = extendedWordList if extendedWordList is not None else _WORDS_LOWER self.shape = [self.paragraphs[1], self.sentences[1], self.words[1]] - self.punctuation = np.array(punctuation) if punctuation is not None else np.array(DEFAULT_PUNCTUATION) # values needed for the text generation # numpy uses fixed sizes for strings , so compute whats needed @@ -738,15 +727,11 @@ def _processWordList(self): self._wordOffsetSize = all_words.size self._sentenceEndOffset = all_words.size - - punctuation = [] - punctuation.extend(self.punctuation) - punctuation.extend(SPACE_SYMBOLS) - - self._paragraphEnd = self._sentenceEndOffset + len(self.punctuation) + self._paragraphEnd = self._sentenceEndOffset + 1 self._wordSpaceOffset = self._paragraphEnd + 1 self._emptyStringOffset = self._wordSpaceOffset + 1 + punctuation = [". 
", "\n\n", " ", ""] all_words = np.concatenate((all_words, punctuation)) self._startOfCapitalsOffset = all_words.size @@ -845,8 +830,6 @@ def generateText(self, baseValues, rowCount=1): new_col = new_word_offsets[:, :, :, np.newaxis] terminated_word_offsets = np.ma.concatenate((masked_offsets, new_col), axis=3) new_column = terminated_word_offsets[:, :, :, -1] - - # TODO: modify to add punctuation to end of sentences new_column[~new_column.mask] = self._sentenceEndOffset # reshape to paragraphs diff --git a/docs/utils/mk_quick_index.py b/docs/utils/mk_quick_index.py index c3d08953..524aaba9 100644 --- a/docs/utils/mk_quick_index.py +++ b/docs/utils/mk_quick_index.py @@ -33,6 +33,8 @@ "grouping": "main classes"}, "text_generator_plugins.py": {"briefDesc": "Text data generation", "grouping": "main classes"}, + "text_generatestring.py": {"briefDesc": "Text data generation", + "grouping": "main classes"}, "data_analyzer.py": {"briefDesc": "Analysis of existing data", "grouping": "main classes"}, "function_builder.py": {"briefDesc": "Internal utilities to create functions related to weights", diff --git a/tests/test_text_generatestring.py b/tests/test_text_generatestring.py new file mode 100644 index 00000000..9d6fc599 --- /dev/null +++ b/tests/test_text_generatestring.py @@ -0,0 +1,99 @@ +import pytest +import pyspark.sql.functions as F +from pyspark.sql.types import BooleanType, DateType +from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType + +import dbldatagen as dg + +spark = dg.SparkSingleton.getLocalInstance("unit tests") + +spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "20000") +spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true") + +#: list of digits for template generation +_DIGITS_ZERO = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'] + +#: list of uppercase letters for template generation +_LETTERS_UPPER = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 
'P', + 'Q', 'R', 'T', 'S', 'U', 'V', 'W', 'X', 'Y', 'Z'] + +#: list of lowercase letters for template generation +_LETTERS_LOWER = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', + 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] + +#: list of all letters uppercase and lowercase +_LETTERS_ALL = _LETTERS_LOWER + _LETTERS_UPPER + +#: list of alphanumeric chars in lowercase +_ALNUM_LOWER = _LETTERS_LOWER + _DIGITS_ZERO + +#: list of alphanumeric chars in uppercase +_ALNUM_UPPER = _LETTERS_UPPER + _DIGITS_ZERO + + +# Test manipulation and generation of test data for a large schema +class TestTextGenerateString: + + @pytest.mark.parametrize("length, leadingAlpha, allUpper, allLower, allAlpha, customChars", + [ + (5, True, True, False, False, None), + (5, True, False, True, False, None), + (5, True, False, False, True, None), + (5, False, False, False, False, None), + (5, False, True, False, True, None), + (5, False, False, True, True, None), + (5, False, False, False, False, "01234567890ABCDEF"), + ]) + def test_basics(self, length, leadingAlpha, allUpper, allLower, allAlpha, customChars): + + tg1 = dg.GenerateString(length, leadingAlpha=leadingAlpha, allUpper=allUpper, allLower=allLower, + allAlpha=allAlpha, customChars=customChars) + + assert tg1._charAlphabet is not None + assert tg1._firstCharAlphabet is not None + + if allUpper and allAlpha: + alphabet = _LETTERS_UPPER + elif allLower and allAlpha: + alphabet = _LETTERS_LOWER + elif allLower: + alphabet = _LETTERS_LOWER + _DIGITS_ZERO + elif allUpper: + alphabet = _LETTERS_UPPER + _DIGITS_ZERO + elif allAlpha: + alphabet = _LETTERS_UPPER + _LETTERS_LOWER + else: + alphabet = _LETTERS_UPPER + _LETTERS_LOWER + _DIGITS_ZERO + + if customChars is not None: + alphabet = set(alphabet).intersection(set(customChars)) + + assert set(tg1._charAlphabet) == set(alphabet) + + @pytest.mark.parametrize("genstr", + [ + dg.GenerateString((1, 10)), + dg.GenerateString((1, 10), leadingAlpha=True), + 
dg.GenerateString((4, 64), allUpper=True), + dg.GenerateString((10, 20), allLower=True), + dg.GenerateString((1, 10)), + dg.GenerateString((3, 15)), + dg.GenerateString((17, 22)), + dg.GenerateString((1, 10)), + ]) + def test_simple_data(self, genstr): + dgspec = (dg.DataGenerator(sparkSession=spark, name="alt_data_set", rows=10000, + partitions=4, seedMethod='hash_fieldname', verbose=True, + seedColumnName="_id") + .withIdOutput() + .withColumn("code2", IntegerType(), min=0, max=10) + .withColumn("code3", StringType(), values=['a', 'b', 'c']) + .withColumn("code4", StringType(), values=['a', 'b', 'c'], random=True) + .withColumn("code5", StringType(), text=dg.GenerateString((1, 10))) + ) + + fieldsFromGenerator = set(dgspec.getOutputColumnNames()) + + df_testdata = dgspec.build() + + df_testdata.show() From df4dd558fafa340d3bebbe1ef6f45175f6c6cc4c Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Thu, 1 Jun 2023 02:04:32 -0700 Subject: [PATCH 19/20] wip --- dbldatagen/__init__.py | 2 +- makefile | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dbldatagen/__init__.py b/dbldatagen/__init__.py index 101b3f84..69956112 100644 --- a/dbldatagen/__init__.py +++ b/dbldatagen/__init__.py @@ -47,7 +47,7 @@ __all__ = ["data_generator", "data_analyzer", "schema_parser", "daterange", "nrange", "column_generation_spec", "utils", "function_builder", "spark_singleton", "text_generators", "datarange", "datagen_constants", - "text_generator_plugins", "html_utils", "text_generatestring", "value_based_prng" + "text_generator_plugins", "html_utils", "text_generatestring" ] diff --git a/makefile b/makefile index 7c006b3e..d16fb159 100644 --- a/makefile +++ b/makefile @@ -89,11 +89,11 @@ dev-test: dev-lint-report: @echo "$(OK_COLOR)=> Running Prospector lint reporting $(PWD) $(NO_COLOR)" - prospector --profile prospector.yaml > prospector_report.txt + prospector --profile prospector.yaml dbldatagen > prospector_report.txt dev-lint: @echo "$(OK_COLOR)=> Running 
Prospector lint reporting $(PWD) $(NO_COLOR)" - prospector --profile prospector.yaml + prospector --profile prospector.yaml dbldatagen dev-test-with-html-report: @echo "$(OK_COLOR)=> Running unit tests with HTML test coverage report$(NO_COLOR)" From 0a93d0f646863db9041a08aaab5470751abcd3f0 Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Thu, 1 Jun 2023 02:26:06 -0700 Subject: [PATCH 20/20] wip --- tests/test_generation_from_data.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/test_generation_from_data.py b/tests/test_generation_from_data.py index fab15809..25c36aa1 100644 --- a/tests/test_generation_from_data.py +++ b/tests/test_generation_from_data.py @@ -71,6 +71,9 @@ def test_code_generation1(self, generation_spec, setupLogging): ast_tree = ast.parse(generatedCode) assert ast_tree is not None + generatedCode2 = analyzer.scriptDataGeneratorFromData(asHtml=True) + assert generatedCode in generatedCode2 + def test_code_generation_from_schema(self, generation_spec, setupLogging): df_source_data = generation_spec.build() generatedCode = dg.DataAnalyzer.scriptDataGeneratorFromSchema(df_source_data.schema) @@ -82,6 +85,10 @@ def test_code_generation_from_schema(self, generation_spec, setupLogging): ast_tree = ast.parse(generatedCode) assert ast_tree is not None + generatedCode2 = dg.DataAnalyzer.scriptDataGeneratorFromSchema(df_source_data.schema, asHtml=True) + + assert generatedCode in generatedCode2 + def test_summarize(self, testLogger, generation_spec): testLogger.info("Building test data")