Skip to content

Commit d4e5870

Browse files
committed
parser: Update nginx to better match F=1 behavior
With testcase from mirror.ghettoforge.org/distributions/gf/
1 parent 6253924 commit d4e5870

File tree

5 files changed

+135
-34
lines changed

5 files changed

+135
-34
lines changed

fixtures/ghettoforge/index.html

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
2+
<html>
3+
<head>
4+
<title>Index of /distributions/gf</title>
5+
</head>
6+
<body>
7+
<h1>Index of /distributions/gf</h1>
8+
<pre><img src="/icons/blank.gif" alt="Icon "> <a href="?C=N;O=D">Name</a> <a href="?C=M;O=A">Last modified</a> <a href="?C=S;O=A">Size</a> <a href="?C=D;O=A">Description</a><hr><img src="/icons/back.gif" alt="[PARENTDIR]"> <a href="/distributions/">Parent Directory</a> -
9+
<img src="/icons/unknown.gif" alt="[ ]"> <a href="RPM-GPG-KEY-gf.el7">RPM-GPG-KEY-gf.el7</a> 2014-12-30 02:53 3.0K
10+
<img src="/icons/unknown.gif" alt="[ ]"> <a href="RPM-GPG-KEY-gf.el8">RPM-GPG-KEY-gf.el8</a> 2020-01-13 09:40 3.1K
11+
<img src="/icons/unknown.gif" alt="[ ]"> <a href="RPM-GPG-KEY-gf.el9">RPM-GPG-KEY-gf.el9</a> 2022-08-03 11:28 1.6K
12+
<img src="/icons/folder.gif" alt="[DIR]"> <a href="archive/">archive/</a> 2020-12-21 02:34 -
13+
<img src="/icons/folder.gif" alt="[DIR]"> <a href="el/">el/</a> 2022-08-02 11:57 -
14+
<img src="/icons/unknown.gif" alt="[ ]"> <a href="gf-release-latest.gf.el7.noarch.rpm">gf-release-latest.gf.el7.noarch.rpm</a> 2021-08-21 10:38 8.0K
15+
<img src="/icons/unknown.gif" alt="[ ]"> <a href="gf-release-latest.gf.el8.noarch.rpm">gf-release-latest.gf.el8.noarch.rpm</a> 2021-08-21 10:39 11K
16+
<img src="/icons/unknown.gif" alt="[ ]"> <a href="gf-release-latest.gf.el9.noarch.rpm">gf-release-latest.gf.el9.noarch.rpm</a> 2022-08-03 12:16 9.2K
17+
<hr></pre>
18+
</body></html>

src/parser/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ This is a list of parsers that tsumugu supports:
66
- directory_lister: [Directory Lister](https://www.directorylister.com/).
77
- docker: A specialized parser for <https://download.docker.com/>.
88
- lighttpd: [lighttpd's mod_dirlisting](https://redmine.lighttpd.net/projects/lighttpd/wiki/Docs_ModDirlisting).
9-
- nginx: [Nginx's autoindex](https://nginx.org/en/docs/http/ngx_http_autoindex_module.html).
9+
- nginx: [Nginx's autoindex](https://nginx.org/en/docs/http/ngx_http_autoindex_module.html). It should also work with Apache2's autoindex when the listing is served in `F=1` (FancyIndexed) format.
1010
- caddy: [Caddy's file_server](https://caddyserver.com/docs/caddyfile/directives/file_server).
1111
- fancyindex: [Nginx fancyindex](https://github.com/aperezdc/ngx-fancyindex).
1212

src/parser/fancyindex.rs

Lines changed: 1 addition & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,6 @@ use scraper::{Html, Selector};
1313
#[derive(Debug, Clone, Default)]
1414
pub struct FancyIndexListingParser;
1515

16-
fn guess_date_fmt(date: &str) -> String {
17-
let two_colons = contains_two_colons(date);
18-
let abbr_month = contains_abbreviated_month(date);
19-
let dfmt = if abbr_month { "%Y-%b-%d" } else { "%Y-%m-%d" };
20-
let tfmt = if two_colons { "%H:%M:%S" } else { "%H:%M" };
21-
format!("{} {}", dfmt, tfmt)
22-
}
23-
2416
impl Parser for FancyIndexListingParser {
2517
fn get_list(&self, client: &Client, url: &Url) -> Result<ListResult> {
2618
let resp = get(client, url.clone())?;
@@ -56,7 +48,7 @@ impl Parser for FancyIndexListingParser {
5648
let date = date.trim();
5749

5850
// decide (guess) which time format to use
59-
let date_fmt = guess_date_fmt(date);
51+
let (date_fmt, _) = guess_date_fmt(date);
6052
let date = NaiveDateTime::parse_from_str(date, &date_fmt)?;
6153

6254
items.push(ListItem::new(
@@ -151,9 +143,4 @@ mod tests {
151143
_ => unreachable!(),
152144
}
153145
}
154-
155-
#[test]
156-
fn test_guess_date_fmt() {
157-
assert_eq!(guess_date_fmt("2024-Jul-15 09:46"), "%Y-%b-%d %H:%M");
158-
}
159146
}

src/parser/mod.rs

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,3 +80,45 @@ fn contains_abbreviated_month(s: &str) -> bool {
8080
fn contains_two_colons(s: &str) -> bool {
8181
s.matches(':').count() >= 2
8282
}
83+
84+
// Reports whether `s` starts with four ASCII digits. `guess_date_fmt` uses this
// to decide that a date is written year-first.
//
// The trailing length check rejects strings shorter than four bytes: `all` on
// its own would accept a short (or empty) prefix made only of digits.
fn has_numeric_prefix(s: &str) -> bool {
85+
s.chars().take(4).all(|c| c.is_ascii_digit()) && s.len() >= 4
86+
}
87+
88+
// Guesses the date/time layout of `date` and returns a pair:
//   1. a format string for `NaiveDateTime::parse_from_str`, and
//   2. the source of a regex that matches text in that same layout.
//
// The date part is chosen from two checks:
//   - `contains_abbreviated_month` picks `%b` over a numeric `%m` month
//     (helper defined earlier in this module; presumably it looks for
//     "Jan".."Dec" -- confirm against its definition);
//   - `has_numeric_prefix` (four leading ASCII digits) picks year-first
//     order over day-first order.
// The time part includes seconds only when `date` contains at least two colons.
//
// Note: the nginx parser calls this with the whole metadata text of an entry
// (date followed by size), not only the bare timestamp.
89+
fn guess_date_fmt(date: &str) -> (String, String) {
90+
let two_colons = contains_two_colons(date);
91+
let abbr_month = contains_abbreviated_month(date);
92+
let year_first = has_numeric_prefix(date);
93+
let (dfmt, dfmt_regex) = match (abbr_month, year_first) {
94+
(true, true) => ("%Y-%b-%d", r"\d{4}-\w{3}-\d{2}"),
95+
(true, false) => ("%d-%b-%Y", r"\d{2}-\w{3}-\d{4}"),
96+
(false, true) => ("%Y-%m-%d", r"\d{4}-\d{2}-\d{2}"),
97+
(false, false) => ("%d-%m-%Y", r"\d{2}-\d{2}-\d{4}"),
98+
};
99+
let (tfmt, tfmt_regex) = if two_colons {
100+
("%H:%M:%S", r"\d{2}:\d{2}:\d{2}")
101+
} else {
102+
("%H:%M", r"\d{2}:\d{2}")
103+
};
104+
(
105+
format!("{} {}", dfmt, tfmt),
106+
format!("{} {}", dfmt_regex, tfmt_regex),
107+
)
108+
}
109+
110+
// Unit tests for the date-format guessing helper shared by the parsers.
#[cfg(test)]
111+
mod tests {
112+
use super::*;
113+
114+
// Year-first date with an abbreviated month and no seconds: expects the
// `%Y-%b-%d %H:%M` format together with the regex source for that layout.
#[test]
115+
fn test_guess_date_fmt() {
116+
assert_eq!(
117+
guess_date_fmt("2024-Jul-15 09:46"),
118+
(
119+
"%Y-%b-%d %H:%M".to_owned(),
120+
r"\d{4}-\w{3}-\d{2} \d{2}:\d{2}".to_owned()
121+
)
122+
);
123+
}
124+
}

src/parser/nginx.rs

Lines changed: 73 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -11,19 +11,8 @@ use super::*;
1111
use anyhow::{anyhow, Result};
1212
use regex::Regex;
1313

14-
#[derive(Debug, Clone)]
15-
pub struct NginxListingParser {
16-
metadata_regex: Regex,
17-
}
18-
19-
impl Default for NginxListingParser {
20-
fn default() -> Self {
21-
Self {
22-
metadata_regex: Regex::new(r"(\d{2}-\w{3}-\d{4} \d{2}:\d{2})\s+([\d\.\-kMG]+)$")
23-
.unwrap(),
24-
}
25-
}
26-
}
14+
/// Parser for nginx autoindex listing pages.
///
/// The struct carries no state: the regex that extracts date and size from an
/// entry is built inside `get_list`, from the date format guessed for the page,
/// rather than being stored as a field.
#[derive(Debug, Clone, Default)]
15+
pub struct NginxListingParser {}
2716

2817
impl Parser for NginxListingParser {
2918
fn get_list(&self, client: &reqwest::blocking::Client, url: &url::Url) -> Result<ListResult> {
@@ -34,11 +23,17 @@ impl Parser for NginxListingParser {
3423
let document = Html::parse_document(&body);
3524
let selector = Selector::parse("a").unwrap();
3625
let mut items = Vec::new();
26+
let mut date_fmt = None;
27+
let mut date_regex = None;
3728
for element in document.select(&selector) {
3829
let href = match element.value().attr("href") {
3930
Some(href) => href,
4031
None => continue,
4132
};
33+
if href.starts_with('?') {
34+
// Apache autoindex commands, skip.
35+
continue;
36+
}
4237
// It's not proper to get filename by <a> text
4338
// As when it is too long, this could happen:
4439
// ceph-immutable-object-cache_17.2.6-pve1+3_amd64..> 03-May-2023 23:52 150048
@@ -55,6 +50,11 @@ impl Parser for NginxListingParser {
5550
if name == ".." {
5651
continue;
5752
}
53+
// extra check for Apache server
54+
let inner = element.inner_html();
55+
if inner == "Parent Directory" {
56+
continue;
57+
}
5858
let type_ = if href.as_str().ends_with('/') {
5959
FileType::Directory
6060
} else {
@@ -69,12 +69,25 @@ impl Parser for NginxListingParser {
6969
.to_string();
7070
let metadata_raw = metadata_raw.trim();
7171
debug!("{:?}", metadata_raw);
72-
let metadata = self.metadata_regex.captures(metadata_raw).ok_or(anyhow!(
73-
"Get '{}' for metadata, is this a nginx page?",
74-
metadata_raw
75-
))?;
72+
// guess date format...
73+
if date_fmt.is_none() {
74+
let (f, r) = guess_date_fmt(metadata_raw);
75+
date_fmt = Some(f);
76+
date_regex = Some(Regex::new(&format!(r"({})\s+([\d\.\-kKMG]+)$", r))?);
77+
debug!("date_fmt: {:?} date_regex: {:?}", date_fmt, date_regex)
78+
}
79+
let metadata = date_regex
80+
.clone()
81+
.unwrap()
82+
.captures(metadata_raw)
83+
.ok_or(anyhow!(
84+
"Get '{}' for {} ({}) metadata, is this a nginx page?",
85+
metadata_raw,
86+
name,
87+
href
88+
))?;
7689
let date = metadata.get(1).unwrap().as_str();
77-
let date = NaiveDateTime::parse_from_str(date, "%d-%b-%Y %H:%M")?;
90+
let date = NaiveDateTime::parse_from_str(date, &date_fmt.clone().unwrap())?;
7891
let size = metadata.get(2).unwrap().as_str();
7992
debug!("{} {} {:?} {} {:?}", href, name, type_, date, size);
8093
items.push(ListItem::new(
@@ -84,7 +97,11 @@ impl Parser for NginxListingParser {
8497
{
8598
if size == "-" {
8699
None
87-
} else if size.contains('k') || size.contains('M') || size.contains('G') {
100+
} else if size.contains('k')
101+
|| size.contains('K')
102+
|| size.contains('M')
103+
|| size.contains('G')
104+
{
88105
let (n_size, unit) = FileSize::get_humanized(size);
89106
Some(FileSize::HumanizedBinary(n_size, unit))
90107
} else {
@@ -101,8 +118,11 @@ impl Parser for NginxListingParser {
101118

102119
#[cfg(test)]
103120
mod tests {
121+
use test_log::test;
104122
use url::Url;
105123

124+
use crate::listing::SizeUnit;
125+
106126
use super::*;
107127

108128
#[test]
@@ -182,4 +202,38 @@ mod tests {
182202
_ => unreachable!(),
183203
}
184204
}
205+
206+
// Runs the nginx parser against the ghettoforge fixture, an Apache-style
// listing with `YYYY-MM-DD HH:MM` dates and sizes such as `3.0K`.
// NOTE(review): fetches http://localhost:1921/ghettoforge, so this presumably
// needs a local server exposing `fixtures/` on port 1921 -- confirm the setup.
#[test]
207+
fn test_ghettoforge() {
208+
let client = reqwest::blocking::Client::new();
209+
let items = NginxListingParser::default()
210+
.get_list(
211+
&client,
212+
&url::Url::parse("http://localhost:1921/ghettoforge").unwrap(),
213+
)
214+
.unwrap();
215+
match items {
216+
ListResult::List(items) => {
217+
// The fixture lists 8 files/directories; the `?C=...` sort links and the
// "Parent Directory" link are skipped by the parser and must not be counted.
assert_eq!(items.len(), 8);
218+
assert_eq!(items[0].name, "RPM-GPG-KEY-gf.el7");
219+
assert_eq!(items[0].type_, FileType::File);
220+
assert_eq!(
221+
items[0].size,
222+
Some(FileSize::HumanizedBinary(3.0, SizeUnit::K))
223+
);
224+
assert_eq!(
225+
items[0].mtime,
226+
NaiveDateTime::parse_from_str("2014-12-30 02:53", "%Y-%m-%d %H:%M").unwrap()
227+
);
228+
// Directory entry: trailing slash is dropped from the name and the "-" size maps to None.
assert_eq!(items[3].name, "archive");
229+
assert_eq!(items[3].type_, FileType::Directory);
230+
assert_eq!(items[3].size, None);
231+
assert_eq!(
232+
items[3].mtime,
233+
NaiveDateTime::parse_from_str("2020-12-21 02:34", "%Y-%m-%d %H:%M").unwrap()
234+
);
235+
}
236+
_ => unreachable!(),
237+
}
238+
}
185239
}

0 commit comments

Comments
 (0)