From 44cc26bbc6bb0d41f624c9198441c9466a4ebe61 Mon Sep 17 00:00:00 2001 From: Curtis Rueden Date: Mon, 28 Mar 2022 15:21:48 -0500 Subject: [PATCH 1/6] WIP: Skeleton for ImageJ tutorial ingestion --- _search/server/tutorials.py | 86 +++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 _search/server/tutorials.py diff --git a/_search/server/tutorials.py b/_search/server/tutorials.py new file mode 100644 index 0000000000..acf7d0c76a --- /dev/null +++ b/_search/server/tutorials.py @@ -0,0 +1,86 @@ +#!/bin/env python + +# Parse ImageJ tutorials into documents for +# use with their own searchable collection. + +import logging, os, traceback +import yaml +from parseutil import first_sentence + + +logger = logging.getLogger(__name__) + + +def is_imagej_tutorials(root): + java = os.path.join(root, 'java') + notebooks = os.path.join(root, 'notebooks') + return os.path.isdir(java) and os.path.isdir(notebooks) + + +def parse_java_source(root, path): + logger.debug(f'Parsing Java source file {path}...') + + with open(path) as f: + lines = json.read(f) + + # This is dumb -- do we want to do better? + doc = {} + doc['content'] = ''.join(lines) + + return doc + + +def parse_notebook(root, path): + logger.debug(f'Parsing notebook {path}...') + + with open(path) as f: + data = json.read(f) + + doc = {} + doc['content'] = '' + for cell in data['cells']: + # TODO: implement process_cell: extract source and output(s) if present + doc['content'] += process_cell(cell) + + return doc + + +def find_resources(root, suffix): + # TODO: use pathlib to find all .java or .ipynb (based on suffix) inside root. + pass + + +def load_imagej_tutorials(root): + """ + Loads the content from the given imagej/tutorials folder. 
+ See: https://github.com/imagej/tutorials + """ + java = os.path.join(siteroot, 'java') + notebooks = os.path.join(siteroot, 'notebooks') + if not os.path.isdir(java) or not os.path.isdir(notebooks): + raise ValueError(f'The path {siteroot} does not appear to be a Jekyll site.') + + logger.info('Loading content...') + documents = [] + + for javafile in find_resources(java, '.java'): + try: + doc = parse_java_source(root, path) + if doc: + documents.append(doc) + except: + logger.error(f'Failed to parse {path}:') + traceback.print_exc() + logger.info(f'Loaded {len(documents)} documents from Java source files') + + for nbfile in find_resources(notebooks, '.ipynb'): + try: + doc = parse_notebook(root, path) + if doc: + documents.append(doc) + except: + logger.error(f'Failed to parse {path}:') + traceback.print_exc() + logger.info(f'Loaded {len(documents)} documents from Jupyter notebooks') + + return documents From 230674f490d2f73adbbeb486b474b30d9ce9c681 Mon Sep 17 00:00:00 2001 From: Curtis Rueden Date: Mon, 28 Mar 2022 15:30:25 -0500 Subject: [PATCH 2/6] Switch to pathlib --- _search/server/tutorials.py | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/_search/server/tutorials.py b/_search/server/tutorials.py index acf7d0c76a..ec7b295524 100644 --- a/_search/server/tutorials.py +++ b/_search/server/tutorials.py @@ -3,7 +3,7 @@ # Parse ImageJ tutorials into documents for # use with their own searchable collection. 
-import logging, os, traceback +import logging, traceback import yaml from parseutil import first_sentence @@ -12,12 +12,12 @@ def is_imagej_tutorials(root): - java = os.path.join(root, 'java') - notebooks = os.path.join(root, 'notebooks') - return os.path.isdir(java) and os.path.isdir(notebooks) + java = Path(root) / 'java' + notebooks = Path(root) / 'notebooks' + return java.isdir() and notebooks.isdir() -def parse_java_source(root, path): +def parse_java_source(path): logger.debug(f'Parsing Java source file {path}...') with open(path) as f: @@ -30,7 +30,7 @@ def parse_java_source(root, path): return doc -def parse_notebook(root, path): +def parse_notebook(path): logger.debug(f'Parsing notebook {path}...') with open(path) as f: @@ -45,27 +45,22 @@ def parse_notebook(root, path): return doc -def find_resources(root, suffix): - # TODO: use pathlib to find all .java or .ipynb (based on suffix) inside root. - pass - - def load_imagej_tutorials(root): """ Loads the content from the given imagej/tutorials folder. 
See: https://github.com/imagej/tutorials """ - java = os.path.join(siteroot, 'java') - notebooks = os.path.join(siteroot, 'notebooks') - if not os.path.isdir(java) or not os.path.isdir(notebooks): + java = Path(root) / 'java' + notebooks = Path(root) / 'notebooks' + if not java.isdir() or not notebooks.isdir(): raise ValueError(f'The path {siteroot} does not appear to be a Jekyll site.') logger.info('Loading content...') documents = [] - for javafile in find_resources(java, '.java'): + for javafile in java.rglob("**/*.java"): try: - doc = parse_java_source(root, path) + doc = parse_java_source(javafile) if doc: documents.append(doc) except: @@ -73,9 +68,9 @@ def load_imagej_tutorials(root): traceback.print_exc() logger.info(f'Loaded {len(documents)} documents from Java source files') - for nbfile in find_resources(notebooks, '.ipynb'): + for nbfile in notebooks.rglob("**/*.ipynb"): try: - doc = parse_notebook(root, path) + doc = parse_notebook(nbfile) if doc: documents.append(doc) except: From 33faba750d2ca69f6f9db946359cf4a2f957ae06 Mon Sep 17 00:00:00 2001 From: jackrueth Date: Mon, 28 Mar 2022 15:31:45 -0500 Subject: [PATCH 3/6] Start fixing the imports --- _search/server/tutorials.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/_search/server/tutorials.py b/_search/server/tutorials.py index ec7b295524..3a32b4cc4f 100644 --- a/_search/server/tutorials.py +++ b/_search/server/tutorials.py @@ -4,8 +4,9 @@ # use with their own searchable collection. 
import logging, traceback -import yaml +import json from parseutil import first_sentence +from pathlib import Path logger = logging.getLogger(__name__) From 6f3ed7a517a5029d77c4a97d7ee87e8002d192b7 Mon Sep 17 00:00:00 2001 From: jackrueth Date: Mon, 28 Mar 2022 16:57:26 -0500 Subject: [PATCH 4/6] Add new elements to tutorials.py --- _search/server/index-sites.py | 3 +++ _search/server/tutorials.py | 12 +++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/_search/server/index-sites.py b/_search/server/index-sites.py index 8d500bfe99..7a757b4795 100644 --- a/_search/server/index-sites.py +++ b/_search/server/index-sites.py @@ -2,6 +2,7 @@ import logging, os, sys import jekyll, ijsite, tsutil +import tutorials logger = logging.getLogger('indexer') @@ -12,6 +13,8 @@ def load_site(siteroot): return jekyll.load_jekyll_site(siteroot) if ijsite.is_imagej_website(siteroot): return ijsite.load_site(siteroot) + if tutorials.is_imagej_tutorials(siteroot): + return tutorials.load_imagej_tutorials(siteroot) return None diff --git a/_search/server/tutorials.py b/_search/server/tutorials.py index 3a32b4cc4f..55262eaa48 100644 --- a/_search/server/tutorials.py +++ b/_search/server/tutorials.py @@ -45,6 +45,9 @@ def parse_notebook(path): return doc +def process_cell(cell): + return type(cell) + def load_imagej_tutorials(root): """ @@ -54,7 +57,7 @@ def load_imagej_tutorials(root): java = Path(root) / 'java' notebooks = Path(root) / 'notebooks' if not java.isdir() or not notebooks.isdir(): - raise ValueError(f'The path {siteroot} does not appear to be a Jekyll site.') + raise ValueError(f'The path {root} does not appear to be a Jekyll site.') logger.info('Loading content...') documents = [] @@ -65,7 +68,7 @@ def load_imagej_tutorials(root): if doc: documents.append(doc) except: - logger.error(f'Failed to parse {path}:') + logger.error(f'Failed to parse {Path}:') traceback.print_exc() logger.info(f'Loaded {len(documents)} documents from Java source files') @@ 
-75,8 +78,11 @@ def load_imagej_tutorials(root): if doc: documents.append(doc) except: - logger.error(f'Failed to parse {path}:') + logger.error(f'Failed to parse {Path}:') traceback.print_exc() logger.info(f'Loaded {len(documents)} documents from Jupyter notebooks') return documents + +def main(): + print("Hello") From 6ea51f656469b6f49b7d8a8e27724251f30fc933 Mon Sep 17 00:00:00 2001 From: jackrueth Date: Thu, 31 Mar 2022 15:31:09 -0500 Subject: [PATCH 5/6] Add comments to process_cell for implementation --- _search/server/tutorials.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/_search/server/tutorials.py b/_search/server/tutorials.py index 55262eaa48..32fd036b03 100644 --- a/_search/server/tutorials.py +++ b/_search/server/tutorials.py @@ -46,6 +46,9 @@ def parse_notebook(path): return doc def process_cell(cell): + # 2 cases: java file or a notebook + # case 1: notebook -> need info inside cells and then info from output lines + # case 2: java file -> need class name and class javadoc for description return type(cell) From b5a8f157505f50825853af5d85fac4a6dc9b9bac Mon Sep 17 00:00:00 2001 From: jackrueth Date: Thu, 31 Mar 2022 16:51:28 -0500 Subject: [PATCH 6/6] Add new comments to process_cell --- _search/server/tutorials.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/_search/server/tutorials.py b/_search/server/tutorials.py index 32fd036b03..b192a79692 100644 --- a/_search/server/tutorials.py +++ b/_search/server/tutorials.py @@ -22,7 +22,7 @@ def parse_java_source(path): logger.debug(f'Parsing Java source file {path}...') with open(path) as f: - lines = json.read(f) + lines = json.load(f) # This is dumb -- do we want to do better?
doc = {} @@ -35,7 +35,7 @@ def parse_notebook(path): logger.debug(f'Parsing notebook {path}...') with open(path) as f: - data = json.read(f) + data = json.load(f) doc = {} doc['content'] = '' @@ -45,10 +45,15 @@ def parse_notebook(path): return doc +# type of cell is dict for reference +# 2 cases: java file or a notebook +# case 1: notebook -> need info inside cells and then info from output lines +# case 2: java file -> need class name and class javadoc for description def process_cell(cell): - # 2 cases: java file or a notebook - # case 1: notebook -> need info inside cells and then info from output lines - # case 2: java file -> need class name and class javadoc for description + # case 1: notebook + + # case 2: java files + return type(cell)