
Commit 6d58ce1

Semantic profiler and report generation module integration
Added the modules for generating the report based on the syntactic and semantic features present in the code.

Signed-off-by: Pankaj Thorat <[email protected]>
1 parent 995bfc6 commit 6d58ce1

48 files changed: +70087 additions, -2753 deletions


transforms/code/code_profiler/README.md

Lines changed: 2 additions & 0 deletions
@@ -61,3 +61,5 @@ The high-level system design is as follows:
 For each new target language, the offline phase is utilized to create deterministic rules by harnessing the capabilities of LLMs and working with exemplar code samples from the target language. In this process, Workflow W1 facilitates the creation of rules around syntactic structures based on exemplar code samples, while Workflow W2 is used to establish semantic dimensions for profiling. Subsequently, we derive rules that connect syntactic constructs to the predefined semantic concepts. These rules are then stored in a rule database, ready to be employed during the online phase.
 
 In the online phase, the system dynamically generates profiling outputs for any incoming code snippets. This is achieved by extracting concepts from the snippets using the rules in the database and storing these extractions in a tabular format. The structured tabular format allows for generating additional concept columns, which are then utilized to create comprehensive profiling reports.
+
+
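The online phase described above boils down to: look up rules for a snippet, emit the extracted concepts as additional table columns. A minimal sketch of that flow, assuming a toy rule database (`RULE_DB` and `profile_snippet` are illustrative names, not code from this commit):

```python
# Toy sketch of the online phase: rules map syntactic constructs to
# semantic concepts, and extractions land in a tabular format.
# RULE_DB and profile_snippet are hypothetical, for illustration only.
import pyarrow as pa

RULE_DB = {
    "import numpy": "numerical-computing",
    "import torch": "machine-learning",
}

def profile_snippet(code: str) -> str:
    # Match each rule against the snippet and collect triggered concepts.
    concepts = [concept for construct, concept in RULE_DB.items() if construct in code]
    return ",".join(concepts)

snippets = ["import numpy as np", "import torch\nimport numpy"]
table = pa.table({"contents": snippets})
# Additional concept columns are appended to the structured table.
table = table.append_column("Concepts", pa.array([profile_snippet(s) for s in snippets]))
print(table.to_pydict())
```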
Lines changed: 3 additions & 2 deletions
@@ -1,5 +1,6 @@
 {
     "input": "multi-package.parquet",
-    "contents": "Contents",
-    "language": "Language"
+    "dynamic_schema_mapping": "True",
+    "contents": "contents",
+    "language": "language"
 }
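The added `dynamic_schema_mapping` flag, together with the lowercased `contents`/`language` values, suggests column names are now resolved against the input schema rather than matched exactly. A sketch of what case-insensitive resolution could look like (`resolve_column` is a hypothetical helper, not code from this commit):

```python
# Hypothetical helper: resolve a configured column name against a table's
# schema case-insensitively, mirroring the case-insensitive language
# mapping introduced elsewhere in this commit.
import pyarrow as pa

def resolve_column(table: pa.Table, name: str) -> str:
    lookup = {col.lower(): col for col in table.schema.names}
    if name.lower() not in lookup:
        raise KeyError(f"No column matching '{name}' in {table.schema.names}")
    return lookup[name.lower()]

table = pa.table({"Contents": ["print('hi')"], "Language": ["Python"]})
# Config now says "contents"/"language"; older parquet files may still
# carry "Contents"/"Language".
print(table.column(resolve_column(table, "contents")))
```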
3 binary files not shown.

transforms/code/code_profiler/notebook_example/code-profiler.ipynb

Lines changed: 555 additions & 987 deletions
Large diffs are not rendered by default.

transforms/code/code_profiler/python/Makefile

Lines changed: 2 additions & 2 deletions
@@ -35,7 +35,7 @@ setup:: .transforms.setup
 set-versions:
 	$(MAKE) TRANSFORM_PYTHON_VERSION=$(CODE_PROFILER_PYTHON_VERSION) TOML_VERSION=$(CODE_PROFILER_PYTHON_VERSION) .transforms.set-versions
 
-build-dist:: .defaults.build-dist
+build-dist:: .defaults.build-dist
 
 publish-dist:: .defaults.publish-dist
 
@@ -51,5 +51,5 @@ run-local-sample: .transforms.run-local-sample
 
 run-local-python-sample:
 	$(MAKE) RUN_FILE=code_profiler_local_python.py \
-	RUN_ARGS="--content 'Contents' --language 'Language'" \
+	RUN_ARGS="--content 'contents' --language 'language'" \
 	.transforms.run-local-python-sample

transforms/code/code_profiler/python/src/UAST_parser.py

Lines changed: 31 additions & 2 deletions
@@ -228,8 +228,9 @@ def _add_user_defined(self, node):
         return
 
     # Traversing through the AST to create nodes recursively.
-    def _dfs(self, AST_node, parent) :
-        if (AST_node.type in self.rules) :
+    def _dfs(self, AST_node, parent):
+
+        if (AST_node.type in self.rules):
             ast_snippet = AST_node.text.decode("utf8")
             node_type = self.rules[AST_node.type]["uast_node_type"]
             exec_string = self.rules[AST_node.type]["extractor"]
@@ -269,3 +270,31 @@ def _extract(self, ast_snippet, node_type, exec_string):
             return self.grammar[node_type]["keyword"] + " " + self.extracted
         except Exception as e:
             print(e)
+
+def uast_read(jsonstring):
+    """
+    Reads an input json string into UAST class object
+    """
+    uast = UAST()
+    if jsonstring is not None and jsonstring != 'null':
+        uast.load_from_json_string(jsonstring)
+        return uast
+    return None
+
+def extract_ccr(uast):
+    """
+    Calculates the code to comment ratio given an UAST object as input
+    """
+    if uast is not None:
+        total_comment_loc = 0
+        for node_idx in uast.nodes:
+            node = uast.get_node(node_idx)
+            if node.node_type == 'uast_comment':
+                total_comment_loc += node.metadata.get("loc_original_code", 0)
+            elif node.node_type == 'uast_root':
+                loc_snippet = node.metadata.get("loc_snippet", 0)
+        if total_comment_loc > 0:
+            return loc_snippet / total_comment_loc
+        else:
+            return None
+    return None
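The two new module-level helpers let the transform rebuild a UAST from its JSON serialization and derive a code-to-comment ratio from it. Note that `extract_ccr` returns None when a snippet has no comment nodes, so downstream columns must tolerate nulls. Rough usage (the import path is assumed from the file name; real UAST JSON comes from `UASTParser`):

```python
# Sketch of how the new helpers compose; module path assumed from the
# file name UAST_parser.py.
from UAST_parser import uast_read, extract_ccr

# Rows with no parse result carry the string 'null'; uast_read maps those
# to None, and extract_ccr passes None through.
assert uast_read('null') is None
assert extract_ccr(None) is None

# For a parsed row:
# uast = uast_read(uast_json)   # uast_json produced by UASTParser
# ccr = extract_ccr(uast)       # snippet LOC / total comment LOC, or None
```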

transforms/code/code_profiler/python/src/code_profiler_local_python.py

Lines changed: 2 additions & 2 deletions
@@ -24,8 +24,8 @@
 local_conf = {
     "input_folder": input_folder,
     "output_folder": output_folder,
-    "contents": "Contents",
-    "language": "Language"
+    "contents": "contents",
+    "language": "language"
 }
 params = {
     # Data access. Only required parameters are specified

transforms/code/code_profiler/python/src/code_profiler_transform.py

Lines changed: 95 additions & 60 deletions
@@ -40,10 +40,8 @@
 
 short_name = "CodeProfiler"
 cli_prefix = f"{short_name}_"
-language_key = "language"
-contents_key = "contents"
-language_cli_param = f"{cli_prefix}{language_key}"
-contents_cli_param = f"{cli_prefix}{contents_key}"
+language = "language"
+contents = "contents"
 
 class CodeProfilerTransform(AbstractTableTransform):
     """
@@ -57,8 +55,11 @@ def __init__(self, config: dict[str, Any]):
 
         super().__init__(config)
 
-        self.contents = self.config.get("contents")
-        self.language = self.config.get("language")
+        self.contents = self.config.get("contents", "contents")
+        self.language = self.config.get("language", "language")
+
+        if not isinstance(self.contents, str):
+            raise ValueError(f"'contents' should be a string, got {type(self.contents).__name__}")
 
         def ensure_tree_sitter_bindings():
             # Get the directory where the script is located
@@ -148,23 +149,46 @@ def ensure_tree_sitter_bindings():
         self.ikb_file = config.get("ikb_file", "semantic-ruleset/ikb_model.csv")
         self.null_libs_file = config.get("null_libs_file", "semantic-ruleset/null_libs.csv")
 
+        src_file_dir = os.path.abspath(os.path.dirname(__file__))
+        # Check if the file exists; if not, update the default path
+        if not os.path.exists(self.ikb_file):
+            print(f"File not found at {self.ikb_file}. Updating to '../semantic-ruleset/ikb_model.csv'")
+            self.ikb_file = os.path.join(src_file_dir, "semantic-ruleset/ikb_model.csv")
+            # Raise an error if the file still doesn't exist
+            if not os.path.exists(self.ikb_file):
+                raise FileNotFoundError(f"File not found: {self.ikb_file}")
+
+        # Check if the file exists; if not, update the default path
+        if not os.path.exists(self.null_libs_file):
+            print(f"File not found at {self.null_libs_file}. Updating to '../semantic-ruleset/null_libs.csv'")
+            self.null_libs_file = os.path.join(src_file_dir, "semantic-ruleset/null_libs.csv")
+            # Raise an error if the file still doesn't exist
+            if not os.path.exists(self.null_libs_file):
+                raise FileNotFoundError(f"File not found: {self.null_libs_file}")
+
         # Higher order semantic features
-        self.metrics_list = config.get("metrics_list", ["CCR"])
+        self.metrics_list = config.get("metrics_list", ["CCR", "code_snippet_len", "avg_fn_len_in_snippet"])
 
     def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]:
         """
         Extracts the syntactic constructs
         """
-        print("tranforming the the input dataframe")
+        print("Transforming the the input dataframe")
 
         ts_parser = TSParser()
         uast_parser = UASTParser()
 
         def get_uast_json(code, lang):
-            if lang in self.language_map:
-                ts_parser.set_language(self.language_map[lang])
-                uast_parser.set_language(self.uast_language_map[lang])
-                ast = ts_parser.parse(bytes(code, encoding= "utf8"))
+            # Create case-insensitive mappings
+            language_map_lower = {key.lower(): value for key, value in self.language_map.items()}
+            uast_language_map_lower = {key.lower(): value for key, value in self.uast_language_map.items()}
+
+            # Check for the lowercase version of `lang`
+            lang_lower = lang.lower()
+            if lang_lower in language_map_lower:
+                ts_parser.set_language(language_map_lower[lang_lower])
+                uast_parser.set_language(uast_language_map_lower[lang_lower])
+                ast = ts_parser.parse(bytes(code, encoding="utf8"))
                 uast = uast_parser.parse(ast, code)
                 return uast.get_json()
             return None
@@ -175,8 +199,12 @@ def extract_packages_from_uast(uast_json):
 
             try:
                 uast_data = json.loads(uast_json)
-                nodes = uast_data.get("nodes", {})
-
+                if uast_data is not None:
+                    nodes = uast_data.get("nodes", {})
+                else:
+                    nodes = {}
+                    print("Warning: uast_data is None. Check the data source or initialization process.")
+                    return
                 # Iterate through nodes to find nodes with type 'uast_package'
                 for node_id, node_data in nodes.items():
                     if node_data.get("node_type") == "uast_package":
@@ -189,13 +217,14 @@
 
             return ",".join(package_list) # Return as a comma-separated string
 
-        def get_uast_parquet():
+        def get_uast_parquet(tmp_table):
             # df = pd.read_parquet(f'{db_path}/{filename}', 'pyarrow')
             # df = df.reindex(columns=all_columns)
 
             # Extract language and content arrays from the table using PyArrow
-            lang_array = table.column(self.language)
-            content_array = table.column(self.contents)
+            print(self.language)
+            lang_array = tmp_table.column(self.language)
+            content_array = tmp_table.column(self.contents)
             # Ensure both arrays have the same length
             assert len(lang_array) == len(content_array)
 
@@ -208,68 +237,74 @@ def get_uast_parquet(tmp_table):
             uast_column = pa.array(uasts)
             package_list_column = pa.array(package_lists)
 
-            table_with_uast = table.append_column('UAST', uast_column)
+            tmp_table_with_uast = tmp_table.append_column('UAST', uast_column)
             # Add the uast_package column
-            table_with_package_list = table_with_uast.append_column('UAST_Package_List', package_list_column)
+            table_with_package_list = tmp_table_with_uast.append_column('UAST_Package_List', package_list_column)
             return table_with_package_list
 
-        # Custom cleanup function
-        def safe_rmtree(path):
-            if os.path.exists(path):
-                shutil.rmtree(path)
-
-        table_with_uast = get_uast_parquet()
-        # report statistics
-        stats = {"source_documents": table.num_columns, "result_documents": table_with_uast.num_columns}
+        table_with_uast = get_uast_parquet(table)
+
+        try:
+            # Use an OS command to remove the folder and its contents
+            subprocess.run(["rm", "-rf", self.bindings_dir], check=True)
+            print(f"Successfully deleted: {self.bindings_dir}")
+        except subprocess.CalledProcessError as e:
+            print(f"Error deleting {self.bindings_dir}: {e}")
 
         ## Semantic profiling
-        table = table_with_uast
-        self.logger.debug(f"Semantic profiling of one table with {len(table)} rows")
+        self.logger.debug(f"Semantic profiling of one table with {len(table_with_uast)} rows")
 
         # Load Knowledge Base
+        print(self.ikb_file)
+        print(self.null_libs_file)
         ikb = knowledge_base(self.ikb_file, self.null_libs_file)
         ikb.load_ikb_trie()
 
         # Extract concept from IKB
-        libraries = table.column('UAST_Package_List').to_pylist()
-        language = table.column('Language').to_pylist()
+        libraries = table_with_uast.column('UAST_Package_List').to_pylist()
+        language = table_with_uast.column('language').to_pylist()
         concepts = [concept_extractor(lib, lang, ikb) for lib, lang in zip(libraries, language)]
 
         # Append concepts column to table and record unknown libraries
         new_col = pa.array(concepts)
-        table = table.append_column('Concepts', new_col)
+        table_with_uast = table_with_uast.append_column('Concepts', new_col)
        ikb.write_null_files()
 
         # Higher order syntactic profiler
-        self.logger.debug(f"Transforming one table with {len(table)} rows")
+        self.logger.debug(f"Transforming one table with {len(table_with_uast)} rows")
 
         if self.metrics_list is not None:
-            for metric in self.metrics_list:
-                if metric == "CCR":
-                    self.logger.info(f"Generating {metric} values")
-                    uasts = [uast_read(uast_json) for uast_json in table['UAST'].to_pylist()]
-                    ccrs = [extract_ccr(uast) for uast in uasts]
-                    new_table = table.append_column(metric, pa.array(ccrs))
-
-        self.logger.debug(f"Transformed one table with {len(new_table)} rows")
-        metadata = {"nfiles": 1, "nrows": len(new_table)}
-
+            uasts = [uast_read(uast_json) for uast_json in table_with_uast['UAST'].to_pylist()]
+            ccrs = []
+            code_snippet_len = []
+            avg_fn_len_in_snippet = []
+
+            for uast in uasts:
+                if "CCR" in self.metrics_list:
+                    ccrs.append(extract_ccr(uast))
+                if "code_snippet_len" in self.metrics_list:
+                    code_snippet_len.append(extract_code_snippet_length(uast))
+                if "avg_fn_len_in_snippet" in self.metrics_list:
+                    avg_fn_len_in_snippet.append(extract_code_avg_fn_len_in_snippet(uast))
+
+            if "CCR" in self.metrics_list:
+                table_with_uast = table_with_uast.append_column("CCR", pa.array(ccrs))
+            if "code_snippet_len" in self.metrics_list:
+                table_with_uast = table_with_uast.append_column("code_snippet_len", pa.array(code_snippet_len))
+            if "avg_fn_len_in_snippet" in self.metrics_list:
+                table_with_uast = table_with_uast.append_column("avg_fn_len_in_snippet", pa.array(avg_fn_len_in_snippet))
+
+        self.logger.debug(f"Transformed one table with {len(table_with_uast)} rows")
+        metadata = {"nfiles": 1, "nrows": len(table_with_uast)}
         # Report generation
-        if 'UAST' in new_table.schema.names and 'Concepts' in new_table.schema.names:
-            generate_report(new_table,self.metrics_list)
+        if 'UAST' in table_with_uast.schema.names and 'Concepts' in table_with_uast.schema.names:
+            generate_report(table_with_uast,self.metrics_list)
 
         # Add some sample metadata.
-        self.logger.debug(f"Transformed one table with {len(table)} rows")
-        stats["nrows"] = len(table)
-
-        try:
-            # Use an OS command to remove the folder and its contents
-            subprocess.run(["rm", "-rf", self.bindings_dir], check=True)
-            print(f"Successfully deleted: {self.bindings_dir}")
-        except subprocess.CalledProcessError as e:
-            print(f"Error deleting {self.bindings_dir}: {e}")
-
-        return [table], stats
+        self.logger.debug(f"Transformed one table with {len(table_with_uast)} rows")
+        # report statistics
+        stats = {"source_documents": table.num_columns, "result_documents": table_with_uast.num_columns}
+        return [table_with_uast], stats
 
 class CodeProfilerTransformConfiguration(TransformConfiguration):
     def __init__(self, transform_class: type[AbstractBinaryTransform] = CodeProfilerTransform):
@@ -279,15 +314,15 @@ def __init__(self, transform_class: type[AbstractBinaryTransform] = CodeProfilerTransform):
         )
     def add_input_params(self, parser: ArgumentParser) -> None:
         parser.add_argument(
-            f"--{language_cli_param}",
+            f"--{language}",
             type=str,
-            default="Language",
+            default="language",
             help="Column name that denotes the programming language",
         )
         parser.add_argument(
-            f"--{contents_cli_param}",
+            f"--{contents}",
            type=str,
-            default="Contents",
+            default="contents",
             help="Column name that contains code snippets",
         )
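After the refactor, transform() threads a single `table_with_uast` through successive `append_column` calls and returns it, rather than mixing `table` and `new_table`. The pyarrow pattern in isolation (the metric values below are fabricated stand-ins for `extract_ccr` and the related extractors):

```python
# Standalone sketch of the append_column accumulation pattern used in
# transform(); metric values here are fabricated for illustration.
import pyarrow as pa

table = pa.table({"contents": ["def f():\n    # doc\n    return 1"], "language": ["Python"]})
metrics = {"CCR": [3.0], "code_snippet_len": [3], "avg_fn_len_in_snippet": [3.0]}

for name, values in metrics.items():
    # pa.Table is immutable: append_column returns a new table, so reassign.
    table = table.append_column(name, pa.array(values))

print(table.schema.names)
# ['contents', 'language', 'CCR', 'code_snippet_len', 'avg_fn_len_in_snippet']
```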
