diff --git a/src/main/java/se/lth/cs/nlp/io/OneLineWikipediaPageWriter.java b/src/main/java/se/lth/cs/nlp/io/OneLineWikipediaPageWriter.java
new file mode 100644
index 0000000..03ae0d3
--- /dev/null
+++ b/src/main/java/se/lth/cs/nlp/io/OneLineWikipediaPageWriter.java
@@ -0,0 +1,109 @@
+package se.lth.cs.nlp.io;
+
+import se.lth.cs.nlp.mediawiki.model.WikipediaPage;
+import se.lth.cs.nlp.pipeline.Sink;
+
+import java.io.File;
+import java.io.IOError;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Paths;
+import java.nio.file.StandardOpenOption;
+import java.util.List;
+
+/**
+ * Sink that writes each page with non-empty text as one UTF-8 line, prefixed with
+ * the page signature.
+ *
+ * This file/class is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This file/class is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this forked version of wikiforia.
+ * If not, see <http://www.gnu.org/licenses/>.
+ *
+ * @author Anton Södergren - karl.aj.sodergren@gmail.com
+ */
+public class OneLineWikipediaPageWriter implements Sink<WikipediaPage> {
+
+    private final File output;
+    private FileChannel fileChannel;
+
+    /**
+     * Creates the output file if it is missing, truncates it otherwise, and opens it for writing.
+     *
+     * @param output which file to write to
+     * @throws IOError if the file cannot be created or opened
+     */
+    public OneLineWikipediaPageWriter(File output) {
+        this.output = output;
+        try {
+            // CREATE lets the open succeed when the file does not exist yet.
+            this.fileChannel = FileChannel.open(Paths.get(output.toURI()),
+                    StandardOpenOption.CREATE,
+                    StandardOpenOption.WRITE,
+                    StandardOpenOption.TRUNCATE_EXISTING);
+        } catch (IOException e) {
+            throw new IOError(e);
+        }
+    }
+
+    /**
+     * Writes one line per page with non-empty text: the page signature followed by the
+     * text with every newline replaced by a space. An empty batch writes a final newline
+     * and closes the file; batches received after that are ignored.
+     *
+     * @param batch pages to write, or an empty list to finish the output
+     * @throws IOError if writing to or closing the file fails
+     */
+    @Override
+    public synchronized void process(List<WikipediaPage> batch) {
+        if (this.fileChannel == null) {
+            return;
+        }
+
+        try {
+            if (batch.isEmpty()) {
+                write("\n");
+                this.fileChannel.close();
+                this.fileChannel = null;
+                return;
+            }
+
+            for (WikipediaPage wikipediaPage : batch) {
+                String text = wikipediaPage.getText();
+                if (text != null && text.length() > 0) {
+                    // Prefix the id ({{page:id}}) and flatten the text so each page is one line.
+                    write(wikipediaPage.getSignature() + text.replace('\n', ' ') + "\n");
+                }
+            }
+        } catch (IOException e) {
+            throw new IOError(e);
+        }
+    }
+
+    /**
+     * Encodes the string as UTF-8 and writes all of it; a single channel write
+     * is not guaranteed to consume the whole buffer.
+     */
+    private void write(String s) throws IOException {
+        ByteBuffer buffer = ByteBuffer.wrap(s.getBytes(StandardCharsets.UTF_8));
+        while (buffer.hasRemaining()) {
+            this.fileChannel.write(buffer);
+        }
+    }
+
+    @Override
+    public String toString() {
+        return String.format("One-line Writer { target: %s }", output.getAbsolutePath());
+    }
+}
diff --git a/src/main/java/se/lth/cs/nlp/mediawiki/model/WikipediaPage.java b/src/main/java/se/lth/cs/nlp/mediawiki/model/WikipediaPage.java
index 92b1a28..151d5a0 100644
--- a/src/main/java/se/lth/cs/nlp/mediawiki/model/WikipediaPage.java
+++ b/src/main/java/se/lth/cs/nlp/mediawiki/model/WikipediaPage.java
@@ -30,4 +30,6 @@ public WikipediaPage(Page page, String text) {
     public String getText() {
         return text;
     }
+
+    public String getSignature() { return "{{page:"+this.getId()+"}}"; }
 }
diff --git a/src/main/java/se/lth/cs/nlp/wikiforia/App.java b/src/main/java/se/lth/cs/nlp/wikiforia/App.java
index 1225cd4..8e04747 100644
--- a/src/main/java/se/lth/cs/nlp/wikiforia/App.java
+++ b/src/main/java/se/lth/cs/nlp/wikiforia/App.java
@@ -20,6 +20,7 @@
 import org.apache.commons.lang3.StringUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import se.lth.cs.nlp.io.OneLineWikipediaPageWriter;
 import se.lth.cs.nlp.io.PlainTextWikipediaPageWriter;
 import se.lth.cs.nlp.io.SimpleHadoopTextWriter;
 import se.lth.cs.nlp.io.XmlWikipediaPageWriter;
@@ -102,7 +103,7 @@ public class App
 
     @SuppressWarnings("static-access")
     private static final Option output = OptionBuilder.withLongOpt("output")
-            .withDescription("xml output filepath")
+            .withDescription("output filepath")
             .hasArg()
             .isRequired()
             .withArgName("path")
@@ -117,13 +118,14 @@ public class App
 
     @SuppressWarnings("static-access")
     private static final Option outputFormatOption = OptionBuilder.withLongOpt("output-format")
-            .withDescription("Output format : xml or plain-text")
+            .withDescription("Output format : xml, plain-text or one-line")
             .hasArg()
             .withArgName("outputformat")
-            .create("outputformat");
+            .create("of");
 
     private static final String OUTPUT_FORMAT_XML = "xml";
     private static final String OUTPUT_FORMAT_PLAIN_TEXT = "plain-text";
+    private static final String OUTPUT_FORMAT_ONE_LINE = "one-line";
     private static final String OUTPUT_FORMAT_DEFAULT = OUTPUT_FORMAT_XML;
 
     /**
@@ -248,8 +250,13 @@ public static void convert(
      * @return Sink
      */
     private static Sink<WikipediaPage> getSink(String outputFormat, File outputPath) {
-        return outputFormat != null && outputFormat.trim().equalsIgnoreCase(OUTPUT_FORMAT_PLAIN_TEXT)
-                ? new PlainTextWikipediaPageWriter(outputPath) : new XmlWikipediaPageWriter(outputPath);
+        if (outputFormat != null) {
+            if (outputFormat.trim().equalsIgnoreCase(OUTPUT_FORMAT_PLAIN_TEXT))
+                return new PlainTextWikipediaPageWriter(outputPath);
+            if (outputFormat.trim().equalsIgnoreCase(OUTPUT_FORMAT_ONE_LINE))
+                return new OneLineWikipediaPageWriter(outputPath);
+        }
+        return new XmlWikipediaPageWriter(outputPath);
     }
 
     /**
@@ -283,7 +290,7 @@ public static void convert(
      * @param numThreads the number of threads to use
      * @param batchsize the size of a batch
      * @param filters All filters to append
-     * @param outputFormat format of output i.e. xml or plain-text
+     * @param outputFormat format of output i.e. xml, plain-text or one-line
      */
     public static void convert(
             TemplateConfig config,