Skip to content

Commit 3772c9e

Browse files
authored
Fix parsing for file names with spaces (#24)
Git does not quote file names containing spaces when generating header lines. However, I misread the Git implementation and thought it used spaces as a name terminator by default, implying that names with spaces would be quoted. Rewrite the the name parsing code to better match Git. Specifically, when both names in a header are not quoted, test all splits to see if any of them produce two equal names. Also account for spaces in file names when parsing file metadata. Finally, allow trailing spaces on file names, which seem to be allowed by Git (although this sounds like a terrible idea to me.)
1 parent b666609 commit 3772c9e

File tree

3 files changed

+135
-31
lines changed

3 files changed

+135
-31
lines changed

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ likely undiscovered bugs.
7575

7676
- Numbers immediately followed by non-numeric characters
7777
- Trailing characters on a line after valid or expected content
78+
- Malformed file header lines (lines that start with `diff --git`)
7879

7980
2. Errors for invalid input are generally more verbose and specific than those
8081
from `git apply`.

gitdiff/file_header.go

+99-23
Original file line numberDiff line numberDiff line change
@@ -172,24 +172,88 @@ func (p *parser) ParseTraditionalFileHeader() (*File, error) {
172172
// If the names in the header do not match because the patch is a rename,
173173
// return an empty default name.
174174
func parseGitHeaderName(header string) (string, error) {
175-
firstName, n, err := parseName(header, -1, 1)
176-
if err != nil {
177-
return "", err
175+
header = strings.TrimSuffix(header, "\n")
176+
if len(header) == 0 {
177+
return "", nil
178178
}
179179

180-
if n < len(header) && (header[n] == ' ' || header[n] == '\t') {
181-
n++
182-
}
180+
var err error
181+
var first, second string
183182

184-
secondName, _, err := parseName(header[n:], -1, 1)
185-
if err != nil {
186-
return "", err
183+
// there are 4 cases to account for:
184+
//
185+
// 1) unquoted unquoted
186+
// 2) unquoted "quoted"
187+
// 3) "quoted" unquoted
188+
// 4) "quoted" "quoted"
189+
//
190+
quote := strings.IndexByte(header, '"')
191+
switch {
192+
case quote < 0:
193+
// case 1
194+
first = header
195+
196+
case quote > 0:
197+
// case 2
198+
first = header[:quote-1]
199+
if !isSpace(header[quote-1]) {
200+
return "", fmt.Errorf("missing separator")
201+
}
202+
203+
second, _, err = parseQuotedName(header[quote:])
204+
if err != nil {
205+
return "", err
206+
}
207+
208+
case quote == 0:
209+
// case 3 or case 4
210+
var n int
211+
first, n, err = parseQuotedName(header)
212+
if err != nil {
213+
return "", err
214+
}
215+
216+
// git accepts multiple spaces after a quoted name, but not after an
217+
// unquoted name, since the name might end with one or more spaces
218+
for n < len(header) && isSpace(header[n]) {
219+
n++
220+
}
221+
if n == len(header) {
222+
return "", nil
223+
}
224+
225+
if header[n] == '"' {
226+
second, _, err = parseQuotedName(header[n:])
227+
if err != nil {
228+
return "", err
229+
}
230+
} else {
231+
second = header[n:]
232+
}
187233
}
188234

189-
if firstName != secondName {
235+
first = trimTreePrefix(first, 1)
236+
if second != "" {
237+
if first == trimTreePrefix(second, 1) {
238+
return first, nil
239+
}
190240
return "", nil
191241
}
192-
return firstName, nil
242+
243+
// at this point, both names are unquoted (case 1)
244+
// since names may contain spaces, we can't use a known separator
245+
// instead, look for a split that produces two equal names
246+
247+
for i := 0; i < len(first)-1; i++ {
248+
if !isSpace(first[i]) {
249+
continue
250+
}
251+
second = trimTreePrefix(first[i+1:], 1)
252+
if name := first[:i]; name == second {
253+
return name, nil
254+
}
255+
}
256+
return "", nil
193257
}
194258

195259
// parseGitHeaderData parses a single line of metadata from a Git file header.
@@ -283,25 +347,25 @@ func parseGitHeaderCreatedMode(f *File, line, defaultName string) error {
283347

284348
func parseGitHeaderCopyFrom(f *File, line, defaultName string) (err error) {
285349
f.IsCopy = true
286-
f.OldName, _, err = parseName(line, -1, 0)
350+
f.OldName, _, err = parseName(line, 0, 0)
287351
return
288352
}
289353

290354
func parseGitHeaderCopyTo(f *File, line, defaultName string) (err error) {
291355
f.IsCopy = true
292-
f.NewName, _, err = parseName(line, -1, 0)
356+
f.NewName, _, err = parseName(line, 0, 0)
293357
return
294358
}
295359

296360
func parseGitHeaderRenameFrom(f *File, line, defaultName string) (err error) {
297361
f.IsRename = true
298-
f.OldName, _, err = parseName(line, -1, 0)
362+
f.OldName, _, err = parseName(line, 0, 0)
299363
return
300364
}
301365

302366
func parseGitHeaderRenameTo(f *File, line, defaultName string) (err error) {
303367
f.IsRename = true
304-
f.NewName, _, err = parseName(line, -1, 0)
368+
f.NewName, _, err = parseName(line, 0, 0)
305369
return
306370
}
307371

@@ -349,14 +413,14 @@ func parseMode(s string) (os.FileMode, error) {
349413

350414
// parseName extracts a file name from the start of a string and returns the
351415
// name and the index of the first character after the name. If the name is
352-
// unquoted and term is non-negative, parsing stops at the first occurrence of
353-
// term. Otherwise parsing of unquoted names stops at the first space or tab.
416+
// unquoted and term is non-zero, parsing stops at the first occurrence of
417+
// term.
354418
//
355419
// If the name is exactly "/dev/null", no further processing occurs. Otherwise,
356420
// if dropPrefix is greater than zero, that number of prefix components
357421
// separated by forward slashes are dropped from the name and any duplicate
358422
// slashes are collapsed.
359-
func parseName(s string, term rune, dropPrefix int) (name string, n int, err error) {
423+
func parseName(s string, term byte, dropPrefix int) (name string, n int, err error) {
360424
if len(s) > 0 && s[0] == '"' {
361425
name, n, err = parseQuotedName(s)
362426
} else {
@@ -387,15 +451,12 @@ func parseQuotedName(s string) (name string, n int, err error) {
387451
return name, n, err
388452
}
389453

390-
func parseUnquotedName(s string, term rune) (name string, n int, err error) {
454+
func parseUnquotedName(s string, term byte) (name string, n int, err error) {
391455
for n = 0; n < len(s); n++ {
392456
if s[n] == '\n' {
393457
break
394458
}
395-
if term >= 0 && rune(s[n]) == term {
396-
break
397-
}
398-
if term < 0 && (s[n] == ' ' || s[n] == '\t') {
459+
if term > 0 && s[n] == term {
399460
break
400461
}
401462
}
@@ -440,6 +501,17 @@ func cleanName(name string, drop int) string {
440501
return b.String()
441502
}
442503

504+
// trimTreePrefix removes up to n leading directory components from name.
505+
func trimTreePrefix(name string, n int) string {
506+
i := 0
507+
for ; i < len(name) && n > 0; i++ {
508+
if name[i] == '/' {
509+
n--
510+
}
511+
}
512+
return name[i:]
513+
}
514+
443515
// hasEpochTimestamp returns true if the string ends with a POSIX-formatted
444516
// timestamp for the UNIX epoch after a tab character. According to git, this
445517
// is used by GNU diff to mark creations and deletions.
@@ -468,3 +540,7 @@ func hasEpochTimestamp(s string) bool {
468540
}
469541
return true
470542
}
543+
544+
func isSpace(c byte) bool {
545+
return c == ' ' || c == '\t' || c == '\n'
546+
}

gitdiff/file_header_test.go

+35-8
Original file line numberDiff line numberDiff line change
@@ -310,7 +310,7 @@ func TestCleanName(t *testing.T) {
310310
func TestParseName(t *testing.T) {
311311
tests := map[string]struct {
312312
Input string
313-
Term rune
313+
Term byte
314314
Drop int
315315
Output string
316316
N int
@@ -334,14 +334,17 @@ func TestParseName(t *testing.T) {
334334
"dropPrefix": {
335335
Input: "a/dir/file.txt", Drop: 1, Output: "dir/file.txt", N: 14,
336336
},
337-
"multipleNames": {
338-
Input: "dir/a.txt dir/b.txt", Term: -1, Output: "dir/a.txt", N: 9,
337+
"unquotedWithSpaces": {
338+
Input: "dir/with spaces.txt", Output: "dir/with spaces.txt", N: 19,
339+
},
340+
"unquotedWithTrailingSpaces": {
341+
Input: "dir/with spaces.space ", Output: "dir/with spaces.space ", N: 23,
339342
},
340343
"devNull": {
341344
Input: "/dev/null", Term: '\t', Drop: 1, Output: "/dev/null", N: 9,
342345
},
343-
"newlineAlwaysSeparates": {
344-
Input: "dir/file.txt\n", Term: 0, Output: "dir/file.txt", N: 12,
346+
"newlineSeparates": {
347+
Input: "dir/file.txt\n", Output: "dir/file.txt", N: 12,
345348
},
346349
"emptyString": {
347350
Input: "", Err: true,
@@ -630,9 +633,33 @@ func TestParseGitHeaderName(t *testing.T) {
630633
Input: "a/dir/foo.txt b/dir/bar.txt",
631634
Output: "",
632635
},
633-
"missingSecondName": {
634-
Input: "a/dir/foo.txt",
635-
Err: true,
636+
"matchingNamesWithSpaces": {
637+
Input: "a/dir/file with spaces.txt b/dir/file with spaces.txt",
638+
Output: "dir/file with spaces.txt",
639+
},
640+
"matchingNamesWithTrailingSpaces": {
641+
Input: "a/dir/spaces b/dir/spaces ",
642+
Output: "dir/spaces ",
643+
},
644+
"matchingNamesQuoted": {
645+
Input: `"a/dir/\"quotes\".txt" "b/dir/\"quotes\".txt"`,
646+
Output: `dir/"quotes".txt`,
647+
},
648+
"matchingNamesFirstQuoted": {
649+
Input: `"a/dir/file.txt" b/dir/file.txt`,
650+
Output: "dir/file.txt",
651+
},
652+
"matchingNamesSecondQuoted": {
653+
Input: `a/dir/file.txt "b/dir/file.txt"`,
654+
Output: "dir/file.txt",
655+
},
656+
"noSecondName": {
657+
Input: "a/dir/foo.txt",
658+
Output: "",
659+
},
660+
"noSecondNameQuoted": {
661+
Input: `"a/dir/foo.txt"`,
662+
Output: "",
636663
},
637664
"invalidName": {
638665
Input: `"a/dir/file.txt b/dir/file.txt`,

0 commit comments

Comments
 (0)