Skip to content

Commit c63985a

Browse files
committed
TIKA-4389 cleanups for TIKA-4381 (#2144)
(cherry picked from commit 5737f09)
1 parent 0b58752 commit c63985a

File tree

4 files changed

+107
-76
lines changed

4 files changed

+107
-76
lines changed

tika-core/src/main/java/org/apache/tika/metadata/MAPI.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ public interface MAPI {
2626

2727
String PREFIX_MAPI_META = "mapi" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
2828
String PREFIX_MAPI_ATTACH_META = "mapi:attach" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
29-
String PREFIX_MAPI_RAW_META = PREFIX_MAPI_META + "raw" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
29+
String PREFIX_MAPI_PROPERTY = PREFIX_MAPI_META + "property" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
3030

3131
/**
3232
* MAPI message class. What type of .msg/MAPI file is it?

tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/ExtendedMetadataExtractor.java

+71-58
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,6 @@
4242

4343
import org.apache.tika.metadata.MAPI;
4444
import org.apache.tika.metadata.Metadata;
45-
import org.apache.tika.parser.microsoft.OutlookExtractor;
4645
import org.apache.tika.utils.StringUtils;
4746

4847
/**
@@ -59,60 +58,23 @@ public class ExtendedMetadataExtractor {
5958
loadProperties();
6059
}
6160

62-
63-
private static List<Types.MAPIType> parseDataTypes(String[] arr) {
64-
if (arr.length == 1) {
65-
Types.MAPIType type = parseDataType(arr[0]);
66-
if (type != null) {
67-
return List.of(type);
68-
}
69-
return Collections.EMPTY_LIST;
70-
}
71-
List<Types.MAPIType> types = new ArrayList<>();
72-
for (String s : arr) {
73-
Types.MAPIType type = parseDataType(s);
74-
if (type != null) {
75-
types.add(type);
76-
}
77-
}
78-
return types;
79-
}
80-
81-
private static Types.MAPIType parseDataType(String s) {
82-
if (StringUtils.isBlank(s)) {
83-
return null;
84-
}
85-
String[] parts = s.split(", ");
86-
if (parts.length != 2) {
87-
throw new IllegalArgumentException("expected two parts: " + s);
88-
}
89-
String num = parts[1];
90-
if (num.startsWith("0x")) {
91-
num = num.substring(2);
61+
public static void extract(MAPIMessage msg, Metadata metadata) {
62+
if (msg.getNameIdChunks() == null) {
63+
return;
9264
}
93-
int id = Integer.parseInt(num, 16);
94-
Types.MAPIType type = Types.getById(id);
95-
if (type == null) {
96-
//TODO:
97-
/*
98-
PtypRestriction, 0x00FD
99-
PtypRuleAction, 0x00FE
100-
PtypServerId, 0x00FB
101-
*/
102-
return Types.createCustom(id);
65+
if (msg.getMainChunks() == null || msg.getMainChunks().getRawProperties() == null) {
66+
return;
10367
}
104-
return type;
105-
}
106-
107-
108-
public static void extract(MAPIMessage msg, Metadata metadata) {
109-
//prep our custom nameIdchunk handler
68+
//prep our custom nameIdChunk handler
11069
TikaNameIdChunks tikaNameIdChunks = new TikaNameIdChunks();
11170
//short-circuit for files that have an empty nameIdChunk
11271
long len = 0;
11372
for (Chunk chunk : msg
11473
.getNameIdChunks()
11574
.getAll()) {
75+
if (chunk == null) {
76+
continue;
77+
}
11678
tikaNameIdChunks.record(chunk);
11779
if (chunk instanceof ByteChunk) {
11880
byte[] value = ((ByteChunk)chunk).getValue();
@@ -124,14 +86,21 @@ public static void extract(MAPIMessage msg, Metadata metadata) {
12486
if (len == 0) {
12587
return;
12688
}
127-
tikaNameIdChunks.chunksComplete();
89+
try {
90+
tikaNameIdChunks.chunksComplete();
91+
} catch (IllegalStateException e) {
92+
LOGGER.warn("bad namechunks stream", e);
93+
}
12894
for (Map.Entry<MAPIProperty, PropertyValue> e : msg
12995
.getMainChunks()
13096
.getRawProperties()
13197
.entrySet()) {
13298
//the mapiproperties from POI are the literal storage id for that particular file.
13399
//Those storage ids must be mapped via the name chunk ids into a known id
134100
PropertyValue v = e.getValue();
101+
if (v == null) {
102+
continue;
103+
}
135104
List<MAPITag> mapiTags = tikaNameIdChunks.getTags(e.getKey().id);
136105
MAPITagPair pair = null;
137106
for (MAPITag mapiTag : mapiTags) {
@@ -146,7 +115,6 @@ public static void extract(MAPIMessage msg, Metadata metadata) {
146115
}
147116
updateMetadata(pair, v, metadata);
148117
}
149-
150118
}
151119

152120

@@ -180,7 +148,7 @@ private static void updateMetadata(MAPITagPair pair, PropertyValue propertyValue
180148
if (!includeType(propertyValue)) {
181149
return;
182150
}
183-
String key = MAPI.PREFIX_MAPI_RAW_META + pair.tikaMapiProperty.name;
151+
String key = MAPI.PREFIX_MAPI_PROPERTY + pair.tikaMapiProperty.name;
184152
Types.MAPIType type = propertyValue.getActualType();
185153
if (type == Types.TIME || type == Types.MV_TIME || type == Types.APP_TIME || type == Types.MV_APP_TIME) {
186154
Calendar calendar = (Calendar) propertyValue.getValue();
@@ -190,8 +158,12 @@ private static void updateMetadata(MAPITagPair pair, PropertyValue propertyValue
190158
.toString();
191159
metadata.add(key, calendarString);
192160
} else if (type == Types.BOOLEAN) {
193-
metadata.add(key, Boolean.toString((boolean) propertyValue.getValue()));
194-
} else {
161+
Boolean val = (Boolean)propertyValue.getValue();
162+
if (val == null) {
163+
return;
164+
}
165+
metadata.add(key, Boolean.toString(val));
166+
} else if (! StringUtils.isBlank(propertyValue.toString())) {
195167
metadata.add(key, propertyValue.toString());
196168
}
197169

@@ -205,11 +177,6 @@ private static boolean includeType(PropertyValue propertyValue) {
205177
return true;
206178
}
207179

208-
private static boolean isString(PropertyValue propertyValue) {
209-
Types.MAPIType mapiType = propertyValue.getActualType();
210-
return mapiType == Types.ASCII_STRING || mapiType == Types.MV_ASCII_STRING || mapiType == Types.MV_UNICODE_STRING || mapiType == Types.UNICODE_STRING;
211-
}
212-
213180
private static class TikaMapiProperty {
214181
String name;
215182
ClassID classID; // can be null
@@ -237,7 +204,7 @@ private static void loadProperties() {
237204
.toUUIDString(), setType.getClassID());
238205
}
239206
try (BufferedReader r = new BufferedReader(
240-
new InputStreamReader(OutlookExtractor.class.getResourceAsStream("/org/apache/tika/parser/microsoft/msg/props_table.txt"), UTF_8))) {
207+
new InputStreamReader(ExtendedMetadataExtractor.class.getResourceAsStream("/org/apache/tika/parser/microsoft/msg/props_table.txt"), UTF_8))) {
241208
String line = r.readLine();
242209
while (line != null) {
243210
if (line.isBlank() || line.startsWith("#")) {
@@ -309,4 +276,50 @@ public MAPITagPair(MAPITag mapiTag, TikaMapiProperty tikaMapiProperty) {
309276
this.tikaMapiProperty = tikaMapiProperty;
310277
}
311278
}
279+
280+
281+
private static List<Types.MAPIType> parseDataTypes(String[] arr) {
282+
if (arr.length == 1) {
283+
Types.MAPIType type = parseDataType(arr[0]);
284+
if (type != null) {
285+
return List.of(type);
286+
}
287+
return Collections.EMPTY_LIST;
288+
}
289+
List<Types.MAPIType> types = new ArrayList<>();
290+
for (String s : arr) {
291+
Types.MAPIType type = parseDataType(s);
292+
if (type != null) {
293+
types.add(type);
294+
}
295+
}
296+
return types;
297+
}
298+
299+
private static Types.MAPIType parseDataType(String s) {
300+
if (StringUtils.isBlank(s)) {
301+
return null;
302+
}
303+
String[] parts = s.split(", ");
304+
if (parts.length != 2) {
305+
throw new IllegalArgumentException("expected two parts: " + s);
306+
}
307+
String num = parts[1];
308+
if (num.startsWith("0x")) {
309+
num = num.substring(2);
310+
}
311+
int id = Integer.parseInt(num, 16);
312+
Types.MAPIType type = Types.getById(id);
313+
if (type == null) {
314+
//TODO:
315+
/*
316+
PtypRestriction, 0x00FD
317+
PtypRuleAction, 0x00FE
318+
PtypServerId, 0x00FB
319+
*/
320+
return Types.createCustom(id);
321+
}
322+
return type;
323+
}
324+
312325
}

tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/msg/TikaNameIdChunks.java

+4-2
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ Licensed to the Apache Software Foundation (ASF) under one or more
1818
package org.apache.tika.parser.microsoft.msg;
1919

2020
import java.util.ArrayList;
21+
import java.util.Collections;
2122
import java.util.HashMap;
2223
import java.util.List;
2324
import java.util.Locale;
@@ -132,10 +133,11 @@ public void chunksComplete() {
132133
loadTags();
133134
}
134135

136+
//never returns null: an empty list is returned when no tags are mapped to the storageId
135137
public List<MAPITag> getTags(int storageId) {
136138
List<MAPITag> tags = mapiTagMap.get(storageId);
137139
if (tags == null) {
138-
return new ArrayList<>();
140+
return Collections.emptyList();
139141
}
140142
return tags;
141143
}
@@ -235,7 +237,7 @@ private long getPropertyTag(long streamID, long nameOffset, long propertyNameCRC
235237
return 0;
236238
}
237239
for (Chunk chunk : chunks) {
238-
if (chunk.getType() != Types.BINARY || chunk.getChunkId() != streamID) {
240+
if (chunk == null || chunk.getType() != Types.BINARY || chunk.getChunkId() != streamID) {
239241
continue;
240242
}
241243
byte[] matchChunkBytes = ((ByteChunk) chunk).getValue();

tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java

+31-15
Original file line numberDiff line numberDiff line change
@@ -317,17 +317,17 @@ public void testAppointmentExtendedMetadata() throws Exception {
317317

318318
List<Metadata> metadataList = getRecursiveMetadata("testMSG_Appointment.msg", parseContext);
319319
Metadata m = metadataList.get(0);
320-
assertTrue(m.get("mapi:raw:PidLidAppointmentEndWhole").contains("2017-02-28T19"));
321-
assertTrue(m.get("mapi:raw:PidLidAppointmentStartWhole").contains("2017-02-28T18"));
322-
assertTrue(m.get("mapi:raw:PidLidClipStart").contains("2017-02-28T18"));
323-
assertTrue(m.get("mapi:raw:PidLidClipEnd").contains("2017-02-28T19"));
324-
assertTrue(m.get("mapi:raw:PidLidCommonStart").contains("2017-02-28T18"));
325-
assertTrue(m.get("mapi:raw:PidLidCommonEnd").contains("2017-02-28T19"));
326-
assertTrue(m.get("mapi:raw:PidLidReminderSignalTime").contains("4501-01-01T00"));
327-
assertTrue(m.get("mapi:raw:PidLidReminderTime").contains("2017-02-28T18"));
328-
assertTrue(m.get("mapi:raw:PidLidValidFlagStringProof").contains("2017-02-28T18:42"));
329-
assertEquals("0", m.get("mapi:raw:PidLidAppointmentSequence"));
330-
assertEquals("false", m.get("mapi:raw:PidLidRecurring"));
320+
assertTrue(m.get("mapi:property:PidLidAppointmentEndWhole").contains("2017-02-28T19"));
321+
assertTrue(m.get("mapi:property:PidLidAppointmentStartWhole").contains("2017-02-28T18"));
322+
assertTrue(m.get("mapi:property:PidLidClipStart").contains("2017-02-28T18"));
323+
assertTrue(m.get("mapi:property:PidLidClipEnd").contains("2017-02-28T19"));
324+
assertTrue(m.get("mapi:property:PidLidCommonStart").contains("2017-02-28T18"));
325+
assertTrue(m.get("mapi:property:PidLidCommonEnd").contains("2017-02-28T19"));
326+
assertTrue(m.get("mapi:property:PidLidReminderSignalTime").contains("4501-01-01T00"));
327+
assertTrue(m.get("mapi:property:PidLidReminderTime").contains("2017-02-28T18"));
328+
assertTrue(m.get("mapi:property:PidLidValidFlagStringProof").contains("2017-02-28T18:42"));
329+
assertEquals("0", m.get("mapi:property:PidLidAppointmentSequence"));
330+
assertEquals("false", m.get("mapi:property:PidLidRecurring"));
331331
}
332332

333333
@Test
@@ -338,12 +338,28 @@ public void testTaskExtendedMetadata() throws Exception {
338338
parseContext.set(OfficeParserConfig.class, officeParserConfig);
339339
List<Metadata> metadataList = getRecursiveMetadata("testMSG_Task.msg", parseContext);
340340
Metadata m = metadataList.get(0);
341-
assertTrue(m.get("mapi:raw:PidLidToDoOrdinalDate").contains("2017-02-28T18:44"));
342-
assertTrue(m.get("mapi:raw:PidLidValidFlagStringProof").contains("2017-02-28T18:44"));
343-
assertEquals("0", m.get("mapi:raw:PidLidTaskActualEffort"));
344-
assertEquals("false", m.get("mapi:raw:PidLidTeamTask"));
341+
assertTrue(m.get("mapi:property:PidLidToDoOrdinalDate").contains("2017-02-28T18:44"));
342+
assertTrue(m.get("mapi:property:PidLidValidFlagStringProof").contains("2017-02-28T18:44"));
343+
assertEquals("0", m.get("mapi:property:PidLidTaskActualEffort"));
344+
assertEquals("false", m.get("mapi:property:PidLidTeamTask"));
345345
}
346346

347+
@Test
348+
public void testContactExtendedMetadata() throws Exception {
349+
List<Metadata> metadataList = getRecursiveMetadata("testMSG_Contact.msg");
350+
Metadata m = metadataList.get(0);
351+
assertEquals("2017-02-28T18:41:37Z", m.get("mapi:property:PidLidValidFlagStringProof"));
352+
}
353+
354+
355+
@Test
356+
public void testPostExtendedMetadata() throws Exception {
357+
List<Metadata> metadataList = getRecursiveMetadata("testMSG_Post.msg");
358+
Metadata m = metadataList.get(0);
359+
assertEquals("2017-02-28T18:47:11Z", m.get("mapi:property:PidLidValidFlagStringProof"));
360+
}
361+
362+
347363
@Test
348364
public void testHandlingAllAlternativesBodies() throws Exception {
349365
//test that default only has one body

0 commit comments

Comments
 (0)