Skip to content

Commit

Permalink
minor update
Browse files Browse the repository at this point in the history
  • Loading branch information
XiangpengHao committed Jan 11, 2025
1 parent fd76858 commit 687a53c
Show file tree
Hide file tree
Showing 6 changed files with 50 additions and 34 deletions.
20 changes: 10 additions & 10 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,11 @@ zstd-sys = { version = "=2.0.9", default-features = false }
serde = { version = "1.0" }

# Use our fork until https://github.com/apache/opendal/pull/5530
opendal = { git = "https://github.com/XiangpengHao/opendal.git", default-features = false, features = [
opendal = { git = "https://github.com/XiangpengHao/opendal.git", default-features = false, rev = "8b6ea6", features = [
"services-s3",
"services-http",
] }
object_store_opendal = { git = "https://github.com/XiangpengHao/opendal.git", features = [
object_store_opendal = { git = "https://github.com/XiangpengHao/opendal.git", rev = "8b6ea6", features = [
"send_wrapper",
] }

Expand Down
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@ Online at: https://parquet-viewer.xiangpeng.systems
For example, [`parquet-viewer.xiangpeng.systems/?url=https://raw.githubusercontent.com/tobilg/public-cloud-provider-ip-ranges/main/data/providers/all.parquet`](https://parquet-viewer.xiangpeng.systems/?url=https://raw.githubusercontent.com/tobilg/public-cloud-provider-ip-ranges/main/data/providers/all.parquet) will load the file from GitHub.
`parquet-viewer` is smart enough to download only the data that is relevant to your query, usually a few KB, even if the file is large.

- You can use `parquet-viewer.py` in `utils` to open a local file. This only works in Chrome or Firefox (not Safari).
```bash
./parquet-viewer.py /path/to/your/file.parquet
```



## Development
Expand Down
11 changes: 8 additions & 3 deletions src/file_reader.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
use std::sync::Arc;
use std::sync::{Arc, LazyLock};

use datafusion::execution::object_store::ObjectStoreUrl;
use leptos::prelude::*;
use leptos::wasm_bindgen::{prelude::Closure, JsCast};
use leptos_router::hooks::query_signal;
use object_store::memory::InMemory;
use object_store::path::Path;
use object_store::{ObjectStore, PutPayload};
use object_store_opendal::OpendalStore;
Expand All @@ -12,7 +13,10 @@ use parquet::arrow::async_reader::{AsyncFileReader, ParquetObjectReader};
use url::Url;
use web_sys::js_sys;

use crate::{ParquetTable, INMEMORY_STORE, SESSION_CTX};
use crate::{ParquetTable, SESSION_CTX};

/// Crate-wide in-memory object store.
///
/// The `InMemory` instance is created lazily on first access (via `LazyLock`)
/// and is shared behind an `Arc`.
pub(crate) static INMEMORY_STORE: LazyLock<Arc<InMemory>> =
LazyLock::new(|| Arc::new(InMemory::new()));

const S3_ENDPOINT_KEY: &str = "s3_endpoint";
const S3_ACCESS_KEY_ID_KEY: &str = "s3_access_key_id";
Expand Down Expand Up @@ -45,7 +49,7 @@ pub fn FileReader(
set_error_message: WriteSignal<Option<String>>,
set_parquet_table: WriteSignal<Option<ParquetTable>>,
) -> impl IntoView {
let (active_tab, set_active_tab) = signal("url".to_string());
let (active_tab, set_active_tab) = signal("file".to_string());

let (url_query, set_url_query) = query_signal::<String>("url");
let default_url = {
Expand Down Expand Up @@ -258,6 +262,7 @@ pub fn FileReader(
Some(url) => {
// user provided a URL, set it and run it.
set_url.set(url);
set_active_tab.set("url".to_string());
on_url_submit();
}
None => set_url.set(DEFAULT_URL.to_string()),
Expand Down
6 changes: 1 addition & 5 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,12 @@ use datafusion::{
physical_plan::ExecutionPlan,
prelude::{SessionConfig, SessionContext},
};
use file_reader::FileReader;
use file_reader::{FileReader, INMEMORY_STORE};
use leptos_router::{
components::Router,
hooks::{query_signal, use_query_map},
};

use object_store::memory::InMemory;
use query_results::{export_to_csv_inner, export_to_parquet_inner, QueryResult, QueryResultView};
use schema::SchemaSection;

Expand All @@ -38,9 +37,6 @@ use query_input::{execute_query_inner, QueryInput};
mod settings;
use settings::Settings;

pub(crate) static INMEMORY_STORE: LazyLock<Arc<InMemory>> =
LazyLock::new(|| Arc::new(InMemory::new()));

pub(crate) static SESSION_CTX: LazyLock<Arc<SessionContext>> = LazyLock::new(|| {
let mut config = SessionConfig::new();
config.options_mut().sql_parser.dialect = "PostgreSQL".to_string();
Expand Down
38 changes: 24 additions & 14 deletions utils/parquet-viewer.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,25 +1,32 @@
#!/usr/bin/env python3
# encoding: utf-8

from http.server import HTTPServer, SimpleHTTPRequestHandler
from http.server import HTTPServer, BaseHTTPRequestHandler
from socketserver import ThreadingMixIn
import argparse
import os
import webbrowser


class CORSRequestHandler(SimpleHTTPRequestHandler):
# HTTPServer combined with ThreadingMixIn.
# NOTE(review): daemon_threads = True presumably makes the per-request threads
# daemon threads so they do not keep the process alive on exit — confirm
# against the socketserver documentation.
class ThreadedHTTPServer(ThreadingMixIn, HTTPServer):
daemon_threads = True


class CORSRequestHandler(BaseHTTPRequestHandler):
def __init__(self, *args, file_path=None, **kwargs):
self.file_path = file_path
super().__init__(*args, **kwargs)

def log_message(self, format, *args):
pass

def end_headers(self):
# Add CORS headers to allow all origins
self.send_header('Access-Control-Allow-Origin', '*')
self.send_header('Access-Control-Allow-Methods', 'GET, HEAD, OPTIONS')
self.send_header('Access-Control-Allow-Headers', '*')
super().end_headers()

def do_OPTIONS(self):
# Handle preflight requests
self.send_response(200)
self.end_headers()

Expand Down Expand Up @@ -58,6 +65,7 @@ def do_GET(self):
self.send_header('Content-Length', str(content_length))
self.send_header('Content-type', 'application/vnd.apache.parquet')
self.send_header('Content-Disposition', f'attachment; filename="{os.path.basename(self.file_path)}"')
self.send_header("Connection", "keep-alive")
self.end_headers()

with open(self.file_path, 'rb') as f:
Expand All @@ -73,11 +81,6 @@ def do_GET(self):
self.send_error(500, f"Internal server error: {str(e)}")

def do_HEAD(self):
expected_path = '/' + os.path.basename(self.file_path)
if self.path != expected_path:
self.send_error(404, f"File not found. Please use {expected_path}")
return

try:
file_size = os.path.getsize(self.file_path)
self.send_response(200)
Expand All @@ -92,17 +95,24 @@ def do_HEAD(self):
self.send_error(500, f"Internal server error: {str(e)}")

def main():
parser = argparse.ArgumentParser(description='Start a CORS-enabled HTTP server')
parser = argparse.ArgumentParser(description='Open a local parquet file in parquet-viewer')
parser.add_argument('file', type=str,
help='Path to the parquet file')
parser.add_argument('-p', '--port', type=int, default=8003,
help='Port to run the server on (default: 8003)')
parser.add_argument('-f', '--file', type=str, required=True,
help='Path to the file to serve')
parser.add_argument('--no-open', action='store_true',
help='Do not open the browser automatically')
args = parser.parse_args()

file_name = os.path.basename(args.file)
handler = lambda *handler_args: CORSRequestHandler(*handler_args, file_path=args.file)
httpd = HTTPServer(('127.0.0.1', args.port), handler)
print(f'Open in your browser: https://parquet-viewer.xiangpeng.systems/?url=http://127.0.0.1:{args.port}/{file_name}')
httpd = ThreadedHTTPServer(('127.0.0.1', args.port), handler)
url = f'https://parquet-viewer.xiangpeng.systems/?url=http://127.0.0.1:{args.port}/{file_name}'
print(f'Opening in your browser:\n{url}')

if not args.no_open:
webbrowser.open(url)

httpd.serve_forever()


Expand Down

0 comments on commit 687a53c

Please sign in to comment.