short_name = "CodeProfiler"
cli_prefix = f"{short_name}_"
-language_key = "language"
-contents_key = "contents"
-language_cli_param = f"{cli_prefix}{language_key}"
-contents_cli_param = f"{cli_prefix}{contents_key}"
+language = "language"
+contents = "contents"

class CodeProfilerTransform(AbstractTableTransform):
    """
@@ -57,8 +55,11 @@ def __init__(self, config: dict[str, Any]):

        super().__init__(config)

-        self.contents = self.config.get("contents")
-        self.language = self.config.get("language")
+        self.contents = self.config.get("contents", "contents")
+        self.language = self.config.get("language", "language")
+
+        if not isinstance(self.contents, str):
+            raise ValueError(f"'contents' should be a string, got {type(self.contents).__name__}")

def ensure_tree_sitter_bindings():
    # Get the directory where the script is located
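The new `__init__` body both supplies defaults and validates the contents key. A standalone sketch of that behavior, assuming a bare config dict (the class here is a stand-in, not the real transform):

from typing import Any

class _ConfigSketch:
    def __init__(self, config: dict[str, Any]):
        self.config = config
        self.contents = self.config.get("contents", "contents")
        self.language = self.config.get("language", "language")
        if not isinstance(self.contents, str):
            raise ValueError(f"'contents' should be a string, got {type(self.contents).__name__}")

_ConfigSketch({})                    # falls back to the "contents"/"language" column names
_ConfigSketch({"contents": "code"})  # overrides the contents column
# _ConfigSketch({"contents": 42})    # would raise: 'contents' should be a string, got int

Note that only `contents` is type-checked here; `language` is accepted as-is.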
@@ -148,23 +149,46 @@ def ensure_tree_sitter_bindings():
    self.ikb_file = config.get("ikb_file", "semantic-ruleset/ikb_model.csv")
    self.null_libs_file = config.get("null_libs_file", "semantic-ruleset/null_libs.csv")

+        src_file_dir = os.path.abspath(os.path.dirname(__file__))
+        # Check if the file exists; if not, update the default path
+        if not os.path.exists(self.ikb_file):
+            print(f"File not found at {self.ikb_file}. Updating to '../semantic-ruleset/ikb_model.csv'")
+            self.ikb_file = os.path.join(src_file_dir, "semantic-ruleset/ikb_model.csv")
+        # Raise an error if the file still doesn't exist
+        if not os.path.exists(self.ikb_file):
+            raise FileNotFoundError(f"File not found: {self.ikb_file}")
+
+        # Check if the file exists; if not, update the default path
+        if not os.path.exists(self.null_libs_file):
+            print(f"File not found at {self.null_libs_file}. Updating to '../semantic-ruleset/null_libs.csv'")
+            self.null_libs_file = os.path.join(src_file_dir, "semantic-ruleset/null_libs.csv")
+        # Raise an error if the file still doesn't exist
+        if not os.path.exists(self.null_libs_file):
+            raise FileNotFoundError(f"File not found: {self.null_libs_file}")
+
        # Higher order semantic features
-        self.metrics_list = config.get("metrics_list", ["CCR"])
+        self.metrics_list = config.get("metrics_list", ["CCR", "code_snippet_len", "avg_fn_len_in_snippet"])

    def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]:
        """
        Extracts the syntactic constructs
        """
-        print("tranforming the the input dataframe")
+        print("Transforming the input dataframe")

        ts_parser = TSParser()
        uast_parser = UASTParser()

        def get_uast_json(code, lang):
-            if lang in self.language_map:
-                ts_parser.set_language(self.language_map[lang])
-                uast_parser.set_language(self.uast_language_map[lang])
-                ast = ts_parser.parse(bytes(code, encoding="utf8"))
+            # Create case-insensitive mappings
+            language_map_lower = {key.lower(): value for key, value in self.language_map.items()}
+            uast_language_map_lower = {key.lower(): value for key, value in self.uast_language_map.items()}
+
+            # Check for the lowercase version of `lang`
+            lang_lower = lang.lower()
+            if lang_lower in language_map_lower:
+                ts_parser.set_language(language_map_lower[lang_lower])
+                uast_parser.set_language(uast_language_map_lower[lang_lower])
+                ast = ts_parser.parse(bytes(code, encoding="utf8"))
                uast = uast_parser.parse(ast, code)
                return uast.get_json()
            return None
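Lowercasing both the map keys and the incoming `lang` makes the lookup tolerant of casing differences in the input data. A standalone sketch with made-up map contents (the real maps come from `self.language_map` and `self.uast_language_map`):

language_map = {"Python": "python", "C++": "cpp", "Java": "java"}
language_map_lower = {key.lower(): value for key, value in language_map.items()}

for lang in ("python", "Python", "PYTHON", "c++"):
    print(lang, "->", language_map_lower.get(lang.lower()))
# Every casing resolves to the same grammar; unknown languages fall
# through to None, mirroring the `return None` above.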
@@ -175,8 +199,12 @@ def extract_packages_from_uast(uast_json):

            try:
                uast_data = json.loads(uast_json)
-                nodes = uast_data.get("nodes", {})
-
+                if uast_data is not None:
+                    nodes = uast_data.get("nodes", {})
+                else:
+                    nodes = {}
+                    print("Warning: uast_data is None. Check the data source or initialization process.")
+                    return
                # Iterate through nodes to find nodes with type 'uast_package'
                for node_id, node_data in nodes.items():
                    if node_data.get("node_type") == "uast_package":
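The None check guards against a JSON document whose top-level value is null: `json.loads` returns None in that case, and the old code's `uast_data.get(...)` would then raise AttributeError. A small demonstration:

import json

uast_data = json.loads("null")
print(uast_data)  # None
nodes = uast_data.get("nodes", {}) if uast_data is not None else {}
print(nodes)      # {}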
@@ -189,13 +217,14 @@ def extract_packages_from_uast(uast_json):

            return ",".join(package_list)  # Return as a comma-separated string

-        def get_uast_parquet():
+        def get_uast_parquet(tmp_table):
            # df = pd.read_parquet(f'{db_path}/{filename}', 'pyarrow')
            # df = df.reindex(columns=all_columns)

            # Extract language and content arrays from the table using PyArrow
-            lang_array = table.column(self.language)
-            content_array = table.column(self.contents)
+            print(self.language)
+            lang_array = tmp_table.column(self.language)
+            content_array = tmp_table.column(self.contents)
            # Ensure both arrays have the same length
            assert len(lang_array) == len(content_array)
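Passing the table in as `tmp_table` instead of closing over the enclosing `table` makes the helper's input explicit and avoids shadowing the `transform` argument. A tiny standalone example of the column access it performs, using made-up data and the default column names (requires pyarrow):

import pyarrow as pa

tmp_table = pa.table({"language": ["Python"], "contents": ["print('hi')"]})
lang_array = tmp_table.column("language")
content_array = tmp_table.column("contents")
assert len(lang_array) == len(content_array)
print(lang_array.to_pylist(), content_array.to_pylist())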
@@ -208,68 +237,74 @@ def get_uast_parquet():
            uast_column = pa.array(uasts)
            package_list_column = pa.array(package_lists)

-            table_with_uast = table.append_column('UAST', uast_column)
+            tmp_table_with_uast = tmp_table.append_column('UAST', uast_column)
            # Add the uast_package column
-            table_with_package_list = table_with_uast.append_column('UAST_Package_List', package_list_column)
+            table_with_package_list = tmp_table_with_uast.append_column('UAST_Package_List', package_list_column)
            return table_with_package_list

-        # Custom cleanup function
-        def safe_rmtree(path):
-            if os.path.exists(path):
-                shutil.rmtree(path)
-
-        table_with_uast = get_uast_parquet()
-        # report statistics
-        stats = {"source_documents": table.num_columns, "result_documents": table_with_uast.num_columns}
+        table_with_uast = get_uast_parquet(table)
+
+        try:
+            # Use an OS command to remove the folder and its contents
+            subprocess.run(["rm", "-rf", self.bindings_dir], check=True)
+            print(f"Successfully deleted: {self.bindings_dir}")
+        except subprocess.CalledProcessError as e:
+            print(f"Error deleting {self.bindings_dir}: {e}")

        ## Semantic profiling
-        table = table_with_uast
-        self.logger.debug(f"Semantic profiling of one table with {len(table)} rows")
+        self.logger.debug(f"Semantic profiling of one table with {len(table_with_uast)} rows")

        # Load Knowledge Base
+        print(self.ikb_file)
+        print(self.null_libs_file)
        ikb = knowledge_base(self.ikb_file, self.null_libs_file)
        ikb.load_ikb_trie()

        # Extract concept from IKB
-        libraries = table.column('UAST_Package_List').to_pylist()
-        language = table.column('Language').to_pylist()
+        libraries = table_with_uast.column('UAST_Package_List').to_pylist()
+        language = table_with_uast.column('language').to_pylist()
        concepts = [concept_extractor(lib, lang, ikb) for lib, lang in zip(libraries, language)]

        # Append concepts column to table and record unknown libraries
        new_col = pa.array(concepts)
-        table = table.append_column('Concepts', new_col)
+        table_with_uast = table_with_uast.append_column('Concepts', new_col)
        ikb.write_null_files()

        # Higher order syntactic profiler
-        self.logger.debug(f"Transforming one table with {len(table)} rows")
+        self.logger.debug(f"Transforming one table with {len(table_with_uast)} rows")

        if self.metrics_list is not None:
-            for metric in self.metrics_list:
-                if metric == "CCR":
-                    self.logger.info(f"Generating {metric} values")
-                    uasts = [uast_read(uast_json) for uast_json in table['UAST'].to_pylist()]
-                    ccrs = [extract_ccr(uast) for uast in uasts]
-                    new_table = table.append_column(metric, pa.array(ccrs))
-
-        self.logger.debug(f"Transformed one table with {len(new_table)} rows")
-        metadata = {"nfiles": 1, "nrows": len(new_table)}
-
+            uasts = [uast_read(uast_json) for uast_json in table_with_uast['UAST'].to_pylist()]
+            ccrs = []
+            code_snippet_len = []
+            avg_fn_len_in_snippet = []
+
+            for uast in uasts:
+                if "CCR" in self.metrics_list:
+                    ccrs.append(extract_ccr(uast))
+                if "code_snippet_len" in self.metrics_list:
+                    code_snippet_len.append(extract_code_snippet_length(uast))
+                if "avg_fn_len_in_snippet" in self.metrics_list:
+                    avg_fn_len_in_snippet.append(extract_code_avg_fn_len_in_snippet(uast))
+
+            if "CCR" in self.metrics_list:
+                table_with_uast = table_with_uast.append_column("CCR", pa.array(ccrs))
+            if "code_snippet_len" in self.metrics_list:
+                table_with_uast = table_with_uast.append_column("code_snippet_len", pa.array(code_snippet_len))
+            if "avg_fn_len_in_snippet" in self.metrics_list:
+                table_with_uast = table_with_uast.append_column("avg_fn_len_in_snippet", pa.array(avg_fn_len_in_snippet))
+
+        self.logger.debug(f"Transformed one table with {len(table_with_uast)} rows")
+        metadata = {"nfiles": 1, "nrows": len(table_with_uast)}
        # Report generation
-        if 'UAST' in new_table.schema.names and 'Concepts' in new_table.schema.names:
-            generate_report(new_table, self.metrics_list)
+        if 'UAST' in table_with_uast.schema.names and 'Concepts' in table_with_uast.schema.names:
+            generate_report(table_with_uast, self.metrics_list)

        # Add some sample metadata.
-        self.logger.debug(f"Transformed one table with {len(table)} rows")
-        stats["nrows"] = len(table)
-
-        try:
-            # Use an OS command to remove the folder and its contents
-            subprocess.run(["rm", "-rf", self.bindings_dir], check=True)
-            print(f"Successfully deleted: {self.bindings_dir}")
-        except subprocess.CalledProcessError as e:
-            print(f"Error deleting {self.bindings_dir}: {e}")
-
-        return [table], stats
+        self.logger.debug(f"Transformed one table with {len(table_with_uast)} rows")
+        # report statistics
+        stats = {"source_documents": table.num_columns, "result_documents": table_with_uast.num_columns}
+        return [table_with_uast], stats
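The rewritten metrics block computes every requested metric in a single pass over the UASTs and appends only the columns named in `metrics_list`. A standalone sketch of the conditional appends with stand-in values (the real values come from `extract_ccr` and the other extractors; requires pyarrow):

import pyarrow as pa

metrics_list = ["CCR", "code_snippet_len"]
table_with_uast = pa.table({"UAST": ["{}", "{}"]})

fake_values = {"CCR": [0.5, 0.7], "code_snippet_len": [10, 12]}
for metric in ("CCR", "code_snippet_len", "avg_fn_len_in_snippet"):
    if metric in metrics_list:
        table_with_uast = table_with_uast.append_column(metric, pa.array(fake_values[metric]))

print(table_with_uast.schema.names)  # ['UAST', 'CCR', 'code_snippet_len']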


class CodeProfilerTransformConfiguration(TransformConfiguration):
    def __init__(self, transform_class: type[AbstractBinaryTransform] = CodeProfilerTransform):
@@ -279,15 +314,15 @@ def __init__(self, transform_class: type[AbstractBinaryTransform] = CodeProfiler
        )

    def add_input_params(self, parser: ArgumentParser) -> None:
        parser.add_argument(
-            f"--{language_cli_param}",
+            f"--{language}",
            type=str,
-            default="Language",
+            default="language",
            help="Column name that denotes the programming language",
        )
        parser.add_argument(
-            f"--{contents_cli_param}",
+            f"--{contents}",
            type=str,
-            default="Contents",
+            default="contents",
            help="Column name that contains code snippets",
        )
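With the renamed parameters, the registered flags behave as below (standalone argparse sketch, not the transform's own runner):

from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument("--language", type=str, default="language",
                    help="Column name that denotes the programming language")
parser.add_argument("--contents", type=str, default="contents",
                    help="Column name that contains code snippets")
print(parser.parse_args([]))  # Namespace(contents='contents', language='language')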