From 65b18f8912870fb6572233d1a9c6edc238930c09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Anton=20S=C3=B6dergren?= Date: Fri, 24 Feb 2017 11:58:45 +0100 Subject: [PATCH 1/2] * Added the one-lined option to App.java * Added a "signature" to WikipediaPage.java * Added the OneLineWikipediaPageWriter.java that writes each article to a single row --- .../cs/nlp/io/OneLineWikipediaPageWriter.java | 89 +++++++++++++++++++ .../cs/nlp/mediawiki/model/WikipediaPage.java | 2 + .../java/se/lth/cs/nlp/wikiforia/App.java | 19 ++-- 3 files changed, 104 insertions(+), 6 deletions(-) create mode 100644 src/main/java/se/lth/cs/nlp/io/OneLineWikipediaPageWriter.java diff --git a/src/main/java/se/lth/cs/nlp/io/OneLineWikipediaPageWriter.java b/src/main/java/se/lth/cs/nlp/io/OneLineWikipediaPageWriter.java new file mode 100644 index 0000000..bf9991f --- /dev/null +++ b/src/main/java/se/lth/cs/nlp/io/OneLineWikipediaPageWriter.java @@ -0,0 +1,89 @@ +package se.lth.cs.nlp.io; + +import se.lth.cs.nlp.mediawiki.model.WikipediaPage; +import se.lth.cs.nlp.pipeline.Sink; + +import java.io.File; +import java.io.IOError; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.file.Paths; +import java.nio.file.StandardOpenOption; +import java.util.List; + +/** + * This file/class is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + *

+ * This file/class is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + *

+ * You should have received a copy of the GNU General Public License + * along with this forked version of wikiforia. + * If not, see . + * + * @author Anton Södergren - karl.aj.sodergren@gmail.com + */ +public class OneLineWikipediaPageWriter implements Sink { + + private final File output; + private FileChannel fileChannel; + + /** + * Default constructor + * + * @param output which file to write to + */ + public OneLineWikipediaPageWriter(File output) { + try { + this.output = output; + + //Fix so it doesn't crash on file not exists + File f = new File(output.toURI()); + f.createNewFile(); + + this.fileChannel = FileChannel.open(Paths.get(output.toURI()), StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING); + } catch (IOException e) { + throw new IOError(e); + } + } + + @Override + public synchronized void process(List batch) { + if (this.fileChannel == null) + return; + + try { + if (batch.size() == 0) { + this.fileChannel.write(ByteBuffer.wrap("\n".getBytes("utf-8"))); + this.fileChannel.close(); + this.fileChannel = null; + return; + } + + for (WikipediaPage wikipediaPage : batch) { + String text = wikipediaPage.getText(); + if (text.length() > 0) { + //Make it one line + text = text.replaceAll("\n", " "); + //Append an id to the start of the line. {{page:id}} + this.fileChannel.write(ByteBuffer.wrap(wikipediaPage.getSignature().getBytes("utf-8"))); + this.fileChannel.write(ByteBuffer.wrap(text.getBytes("utf-8"))); + this.fileChannel.write(ByteBuffer.wrap("\n".getBytes("utf-8"))); + } + } + } catch (IOException e) { + throw new IOError(e); + } + } + + @Override + public String toString() { + return String.format("XML Writer { target: %s }", output.getAbsolutePath()); + } +} diff --git a/src/main/java/se/lth/cs/nlp/mediawiki/model/WikipediaPage.java b/src/main/java/se/lth/cs/nlp/mediawiki/model/WikipediaPage.java index 92b1a28..151d5a0 100644 --- a/src/main/java/se/lth/cs/nlp/mediawiki/model/WikipediaPage.java +++ b/src/main/java/se/lth/cs/nlp/mediawiki/model/WikipediaPage.java @@ -30,4 +30,6 @@ public WikipediaPage(Page page, String text) { public String getText() { return text; } + + public String getSignature() { return "{{page:"+this.getId()+"}}"; } } diff --git a/src/main/java/se/lth/cs/nlp/wikiforia/App.java b/src/main/java/se/lth/cs/nlp/wikiforia/App.java index 1225cd4..233a455 100644 --- a/src/main/java/se/lth/cs/nlp/wikiforia/App.java +++ b/src/main/java/se/lth/cs/nlp/wikiforia/App.java @@ -20,6 +20,7 @@ import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import se.lth.cs.nlp.io.OneLineWikipediaPageWriter; import se.lth.cs.nlp.io.PlainTextWikipediaPageWriter; import se.lth.cs.nlp.io.SimpleHadoopTextWriter; import se.lth.cs.nlp.io.XmlWikipediaPageWriter; @@ -102,7 +103,7 @@ public class App @SuppressWarnings("static-access") private static final Option output = OptionBuilder.withLongOpt("output") - .withDescription("xml output filepath") + .withDescription("output filepath") .hasArg() .isRequired() .withArgName("path") @@ -117,13 +118,14 @@ public class App @SuppressWarnings("static-access") private static final Option outputFormatOption = OptionBuilder.withLongOpt("output-format") - .withDescription("Output format : xml or plain-text") + .withDescription("Output format : xml, plain-text or one-line") .hasArg() .withArgName("outputformat") - .create("outputformat"); + .create("of"); private static final String OUTPUT_FORMAT_XML = "xml"; private static final String OUTPUT_FORMAT_PLAIN_TEXT = "plain-text"; + private static final String OUTPUT_FORMAT_ONE_LINE = "one-line"; private static final String OUTPUT_FORMAT_DEFAULT = OUTPUT_FORMAT_XML; /** @@ -248,8 +250,13 @@ public static void convert( * @return Sink */ private static Sink getSink(String outputFormat, File outputPath) { - return outputFormat != null && outputFormat.trim().equalsIgnoreCase(OUTPUT_FORMAT_PLAIN_TEXT) - ? new PlainTextWikipediaPageWriter(outputPath) : new XmlWikipediaPageWriter(outputPath); + if (outputFormat != null) { + if (outputFormat.trim().equalsIgnoreCase(OUTPUT_FORMAT_PLAIN_TEXT)) + return new PlainTextWikipediaPageWriter(outputPath); + if (outputFormat.trim().equalsIgnoreCase(OUTPUT_FORMAT_ONE_LINE)) + return new OneLineWikipediaPageWriter(outputPath); + } + return new XmlWikipediaPageWriter(outputPath); } /** @@ -283,7 +290,7 @@ public static void convert( * @param numThreads the number of threads to use * @param batchsize the size of a batch * @param filters All filters to append - * @param outputFormat format of output i.e. xml or plain-text + * @param outputFormat format of output i.e. xml, plain-text or one-line */ public static void convert( TemplateConfig config, From 8be4cd2497f00d55c803f50720c9a494e4527ba6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Anton=20S=C3=B6dergren?= Date: Fri, 24 Feb 2017 12:15:42 +0100 Subject: [PATCH 2/2] Prettified ugly indenting --- .../java/se/lth/cs/nlp/io/OneLineWikipediaPageWriter.java | 4 ++-- src/main/java/se/lth/cs/nlp/wikiforia/App.java | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/se/lth/cs/nlp/io/OneLineWikipediaPageWriter.java b/src/main/java/se/lth/cs/nlp/io/OneLineWikipediaPageWriter.java index bf9991f..03ae0d3 100644 --- a/src/main/java/se/lth/cs/nlp/io/OneLineWikipediaPageWriter.java +++ b/src/main/java/se/lth/cs/nlp/io/OneLineWikipediaPageWriter.java @@ -17,12 +17,12 @@ * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - *

+ * * This file/class is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - *

+ * * You should have received a copy of the GNU General Public License * along with this forked version of wikiforia. * If not, see . diff --git a/src/main/java/se/lth/cs/nlp/wikiforia/App.java b/src/main/java/se/lth/cs/nlp/wikiforia/App.java index 233a455..8e04747 100644 --- a/src/main/java/se/lth/cs/nlp/wikiforia/App.java +++ b/src/main/java/se/lth/cs/nlp/wikiforia/App.java @@ -252,7 +252,7 @@ public static void convert( private static Sink getSink(String outputFormat, File outputPath) { if (outputFormat != null) { if (outputFormat.trim().equalsIgnoreCase(OUTPUT_FORMAT_PLAIN_TEXT)) - return new PlainTextWikipediaPageWriter(outputPath); + return new PlainTextWikipediaPageWriter(outputPath); if (outputFormat.trim().equalsIgnoreCase(OUTPUT_FORMAT_ONE_LINE)) return new OneLineWikipediaPageWriter(outputPath); }