Skip to content

Commit

Permalink
Update columns to push to HBase tables (#673)
Browse files (browse the repository at this point in the history)
* Start to think about data structure

* Update columns to be pushed to HBase

* Do not push features

* PEP8
  • Loading branch information
JulienPeloton authored Jan 25, 2023
1 parent 1188779 commit cf41361
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 3 deletions.
6 changes: 5 additions & 1 deletion bin/index_archival.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,9 +118,13 @@ def main():
'gcvs',
'vsx',
'snn_snia_vs_nonia', 'snn_sn_vs_all', 'rf_snia_vs_nonia',
- 'classtar', 'drb', 'ndethist', 'rf_kn_vs_nonkn', 'tracklet'
+ 'classtar', 'drb', 'ndethist', 'rf_kn_vs_nonkn', 'tracklet',
+ 'anomaly_score', 'x4lac', 'x3hsp'
]

common_cols += [col_ for col_ in df.columns if col_.startswith('t2_')]
common_cols += [col_ for col_ in df.columns if col_.startswith('mangrove_')]

if columns[0].startswith('pixel'):
nside = int(columns[0].split('pixel')[1])

Expand Down
27 changes: 25 additions & 2 deletions fink_broker/hbaseUtils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@
from fink_broker import __version__ as fbvsn
from fink_science import __version__ as fsvsn

from fink_science.t2.utilities import T2_COLS
from fink_science.xmatch.utils import MANGROVE_COLS

from fink_broker.tester import spark_unit_tests

def load_hbase_data(catalog: str, rowkey: str) -> DataFrame:
Expand Down Expand Up @@ -161,7 +164,7 @@ def load_science_portal_column_names():
--------
>>> cols_i, cols_d, cols_b = load_science_portal_column_names()
>>> print(len(cols_d))
- 14
+ 35
"""
# Column family i
cols_i = [
Expand All @@ -188,9 +191,29 @@ def load_science_portal_column_names():
'Plx',
'e_Plx',
'gcvs',
- 'vsx'
+ 'vsx',
+ 'x4lac',
+ 'x3hsp',
+ 'anomaly_score'
]

# mangrove
cols_d += [
col('mangrove.{}'.format(i)).alias('mangrove_{}'.format(i)) for i in MANGROVE_COLS
]

cols_d += [
col('t2.{}'.format(i)).alias('t2_{}'.format(i)) for i in T2_COLS
]

# cols_d += [
# col('lc_features_g.{}'.format(i)).alias('lc_features_g_{}'.format(i)) for i in FEATURES_COLS
# ]

# cols_d += [
# col('lc_features_r.{}'.format(i)).alias('lc_features_r_{}'.format(i)) for i in FEATURES_COLS
# ]

# Column family binary
cols_b = [
col('cutoutScience.stampData').alias('cutoutScience_stampData'),
Expand Down

0 comments on commit cf41361

Please sign in to comment.