Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -212,10 +212,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
xhtml.startDocument();
try {
Metadata entrydata = new Metadata();
if (cis instanceof GzipCompressorInputStream) {
extractGzipMetadata((GzipCompressorInputStream) cis, entrydata);
}
setName(metadata, entrydata);
setNameAndInternalPath(cis, metadata, entrydata);

// Use the delegate parser to parse the compressed document
EmbeddedDocumentExtractor extractor =
Expand All @@ -230,33 +227,44 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
xhtml.endDocument();
}

private void extractGzipMetadata(GzipCompressorInputStream gzcis, Metadata metadata) {
GzipParameters gzipParameters = gzcis.getMetaData();
if (gzipParameters == null) {
return;
}
String name = gzipParameters.getFileName();
if (!StringUtils.isBlank(name)) {
metadata.set(TikaCoreProperties.INTERNAL_PATH, name);
/**
 * Retrieves the file name carried in a gzip stream's metadata, if there is one.
 *
 * @param cis the compressor stream currently being parsed
 * @return the file name reported by the stream's {@link GzipParameters}, or {@code null}
 *         when the stream is not a {@link GzipCompressorInputStream}, exposes no
 *         metadata, or reports a blank name
 */
private String getNameFromGzipMetadataIfPossible(CompressorInputStream cis) {
    // Only gzip streams expose this metadata; every other compressor yields no name.
    if (!(cis instanceof GzipCompressorInputStream)) {
        return null;
    }
    GzipParameters params = ((GzipCompressorInputStream) cis).getMetaData();
    if (params == null) {
        return null;
    }
    //TODO: modification, OS, comment
    String storedName = params.getFileName();
    return StringUtils.isBlank(storedName) ? null : storedName;
}

private void setName(Metadata parentMetadata, Metadata metadata) {
String name = parentMetadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
//if parent's name is blank stop now
if (StringUtils.isBlank(name)) {
return;
}
if (name.endsWith(".tgz") || name.endsWith(".tbz") || name.endsWith(".tbz2")) {
name = name.substring(0, name.lastIndexOf(".")) + ".tar";
} else if (name.endsWith(".bz") || name.endsWith("gz") || name.endsWith(".bz2") || name.endsWith(".xz") || name.endsWith(".zlib") || name.endsWith(".pack") ||
name.endsWith(".br")) {
name = name.substring(0, name.lastIndexOf("."));
} else if (!name.isEmpty()) {
name = GzipUtils.getUncompressedFileName(name);
/**
 * Chooses a name for the decompressed entry and records it on the entry's metadata.
 * <p>
 * A name found in the gzip metadata takes precedence. When none is available, the name is
 * derived from the parent's resource name by translating or removing its compression
 * suffix. If the parent's resource name is blank as well, the entry metadata is left
 * untouched.
 *
 * @param cis            the compressor stream being parsed; consulted for gzip metadata
 * @param parentMetadata metadata of the compressed container, read for its resource name
 * @param metadata       metadata of the decompressed entry, which receives the name
 */
private void setNameAndInternalPath(CompressorInputStream cis, Metadata parentMetadata, Metadata metadata) {
    String name = getNameFromGzipMetadataIfPossible(cis);

    if (name == null) {
        name = parentMetadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
        //if parent's name is blank stop now
        if (StringUtils.isBlank(name)) {
            return;
        }
        if (name.endsWith(".tgz") || name.endsWith(".tbz") || name.endsWith(".tbz2")) {
            // Tarball shorthand: swap the combined suffix for ".tar".
            name = name.substring(0, name.lastIndexOf('.')) + ".tar";
        } else if (name.endsWith(".bz") || name.endsWith(".gz") || name.endsWith(".bz2")
                || name.endsWith(".xz") || name.endsWith(".zlib") || name.endsWith(".pack")
                || name.endsWith(".br")) {
            // Every suffix tested here starts with a dot, so lastIndexOf('.') cannot be -1.
            // (Matching a bare "gz" would let dot-less names such as "foo-gz" reach
            // substring(0, -1) and would truncate names such as "x.svgz" to "x".)
            name = name.substring(0, name.lastIndexOf('.'));
        } else {
            // Remaining names are handed to GzipUtils to map any gzip-style suffix.
            name = GzipUtils.getUncompressedFileName(name);
        }
    }

    metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
    // NOTE(review): INTERNAL_PATH is also written when the name was only inferred from the
    // parent's resource name rather than read from the gzip metadata. Confirm this is
    // intended; the field may be meant solely for paths literally stored in the container.
    metadata.set(TikaCoreProperties.INTERNAL_PATH, name);
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ public void testEmbedded() throws Exception {
assertEquals(1, tracker.mediatypes.size());
assertEquals(1, tracker.modifiedAts.size());

assertEquals(null, tracker.filenames.get(0));
assertEquals("test-documents.tar", tracker.filenames.get(0));
assertEquals(null, tracker.mediatypes.get(0));
assertEquals(null, tracker.modifiedAts.get(0));

Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,16 @@ public void testTarball() throws Exception {
"/test-documents.tar"), actualEmbeddedPaths);
}

@Test
public void testTarballWithoutGzipNameMetadata() throws Exception {
List<Metadata> list = getRecursiveMetadata("test-documents-no-name-metadata.tgz");
Metadata last = list.get(list.size() - 1);
String internalPath = last.get(TikaCoreProperties.INTERNAL_PATH);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The point of internal_path is to store what the file contained about the internal path of a resource. This metadata field should tell the user "this was the path that was literally stored in the container file. Tika did no guesswork here".

String embeddedResourcePath = last.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH);
assertEquals("test-documents-no-name-metadata.tar", internalPath);
assertEquals("/test-documents-no-name-metadata.tar", embeddedResourcePath);
}

@Test
public void testCharLimitNoThrowOnWriteLimit() throws Exception {
ParseContext context = new ParseContext();
Expand Down