diff --git a/src/main/java/se/lth/cs/nlp/io/OneLineWikipediaPageWriter.java b/src/main/java/se/lth/cs/nlp/io/OneLineWikipediaPageWriter.java
new file mode 100644
index 0000000..03ae0d3
--- /dev/null
+++ b/src/main/java/se/lth/cs/nlp/io/OneLineWikipediaPageWriter.java
@@ -0,0 +1,124 @@
+package se.lth.cs.nlp.io;
+
+import se.lth.cs.nlp.mediawiki.model.WikipediaPage;
+import se.lth.cs.nlp.pipeline.Sink;
+
+import java.io.File;
+import java.io.IOError;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Paths;
+import java.nio.file.StandardOpenOption;
+import java.util.List;
+
+/**
+ * Sink that writes each Wikipedia page as a single line of UTF-8 text.
+ *
+ * Every page with non-empty text is written as its signature, followed by its
+ * text with line feeds replaced by spaces, followed by a line feed. An empty
+ * batch makes the writer emit one final line feed and close the file; batches
+ * received after that are ignored.
+ *
+ * This file/class is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This file/class is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this forked version of wikiforia.
+ * If not, see {@literal <https://www.gnu.org/licenses/>}.
+ *
+ * @author Anton Södergren - karl.aj.sodergren@gmail.com
+ */
+public class OneLineWikipediaPageWriter implements Sink<WikipediaPage> {
+
+    /** Destination file; kept for {@link #toString()}. */
+    private final File output;
+
+    /** Open channel to {@link #output}; {@code null} once the writer is closed. */
+    private FileChannel fileChannel;
+
+    /**
+     * Opens {@code output} for writing, creating the file if it does not exist
+     * and truncating it if it does.
+     *
+     * @param output which file to write to
+     * @throws IOError if the file cannot be created or opened
+     */
+    public OneLineWikipediaPageWriter(File output) {
+        this.output = output;
+        try {
+            // CREATE makes the file when it is missing, so no separate
+            // createNewFile() call is needed before opening the channel.
+            this.fileChannel = FileChannel.open(
+                    Paths.get(output.toURI()),
+                    StandardOpenOption.CREATE,
+                    StandardOpenOption.WRITE,
+                    StandardOpenOption.TRUNCATE_EXISTING);
+        } catch (IOException e) {
+            throw new IOError(e);
+        }
+    }
+
+    /**
+     * Writes one line per page of {@code batch}, skipping pages without text.
+     * An empty batch writes a final line feed and closes the file.
+     *
+     * @param batch pages to write, or an empty list to finish the output
+     * @throws IOError if writing to or closing the file fails
+     */
+    @Override
+    public synchronized void process(List<WikipediaPage> batch) {
+        if (this.fileChannel == null) {
+            return; // already closed by an earlier empty batch
+        }
+
+        try {
+            if (batch.isEmpty()) {
+                writeFully("\n");
+                this.fileChannel.close();
+                this.fileChannel = null;
+                return;
+            }
+
+            for (WikipediaPage wikipediaPage : batch) {
+                String text = wikipediaPage.getText();
+                if (text == null || text.isEmpty()) {
+                    continue;
+                }
+                // Signature first ({{page:id}}), then the text flattened to one line.
+                // Building the whole line up front lets it go out as one buffer.
+                writeFully(wikipediaPage.getSignature() + text.replace('\n', ' ') + "\n");
+            }
+        } catch (IOException e) {
+            throw new IOError(e);
+        }
+    }
+
+    /**
+     * Encodes {@code s} as UTF-8 and writes all of it to the channel.
+     * {@link FileChannel#write(ByteBuffer)} may write fewer bytes than
+     * requested, so the call is repeated until the buffer is drained.
+     *
+     * @param s text to write
+     * @throws IOException if the channel rejects the write
+     */
+    private void writeFully(String s) throws IOException {
+        ByteBuffer buffer = ByteBuffer.wrap(s.getBytes(StandardCharsets.UTF_8));
+        while (buffer.hasRemaining()) {
+            this.fileChannel.write(buffer);
+        }
+    }
+
+    @Override
+    public String toString() {
+        return String.format("One-line Writer { target: %s }", output.getAbsolutePath());
+    }
+}
diff --git a/src/main/java/se/lth/cs/nlp/mediawiki/model/WikipediaPage.java b/src/main/java/se/lth/cs/nlp/mediawiki/model/WikipediaPage.java
index 92b1a28..151d5a0 100644
--- a/src/main/java/se/lth/cs/nlp/mediawiki/model/WikipediaPage.java
+++ b/src/main/java/se/lth/cs/nlp/mediawiki/model/WikipediaPage.java
@@ -30,4 +30,6 @@ public WikipediaPage(Page page, String text) {
public String getText() {
return text;
}
+
+ public String getSignature() { return "{{page:"+this.getId()+"}}"; }
}
diff --git a/src/main/java/se/lth/cs/nlp/wikiforia/App.java b/src/main/java/se/lth/cs/nlp/wikiforia/App.java
index 1225cd4..8e04747 100644
--- a/src/main/java/se/lth/cs/nlp/wikiforia/App.java
+++ b/src/main/java/se/lth/cs/nlp/wikiforia/App.java
@@ -20,6 +20,7 @@
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import se.lth.cs.nlp.io.OneLineWikipediaPageWriter;
import se.lth.cs.nlp.io.PlainTextWikipediaPageWriter;
import se.lth.cs.nlp.io.SimpleHadoopTextWriter;
import se.lth.cs.nlp.io.XmlWikipediaPageWriter;
@@ -102,7 +103,7 @@ public class App
@SuppressWarnings("static-access")
private static final Option output = OptionBuilder.withLongOpt("output")
- .withDescription("xml output filepath")
+ .withDescription("output filepath")
.hasArg()
.isRequired()
.withArgName("path")
@@ -117,13 +118,14 @@ public class App
@SuppressWarnings("static-access")
private static final Option outputFormatOption = OptionBuilder.withLongOpt("output-format")
- .withDescription("Output format : xml or plain-text")
+ .withDescription("Output format : xml, plain-text or one-line")
.hasArg()
.withArgName("outputformat")
- .create("outputformat");
+ .create("of"); // NOTE(review): short option name changes from "outputformat" to "of" — confirm existing callers are updated
private static final String OUTPUT_FORMAT_XML = "xml";
private static final String OUTPUT_FORMAT_PLAIN_TEXT = "plain-text";
+ private static final String OUTPUT_FORMAT_ONE_LINE = "one-line"; // handled by getSink via OneLineWikipediaPageWriter
private static final String OUTPUT_FORMAT_DEFAULT = OUTPUT_FORMAT_XML;
/**
@@ -248,8 +250,13 @@ public static void convert(
* @return Sink
*/
private static Sink getSink(String outputFormat, File outputPath) {
- return outputFormat != null && outputFormat.trim().equalsIgnoreCase(OUTPUT_FORMAT_PLAIN_TEXT)
- ? new PlainTextWikipediaPageWriter(outputPath) : new XmlWikipediaPageWriter(outputPath);
+        final String format = (outputFormat == null) ? "" : outputFormat.trim(); // null selects the default below
+        if (OUTPUT_FORMAT_PLAIN_TEXT.equalsIgnoreCase(format)) {
+            return new PlainTextWikipediaPageWriter(outputPath);
+        } else if (OUTPUT_FORMAT_ONE_LINE.equalsIgnoreCase(format)) {
+            return new OneLineWikipediaPageWriter(outputPath);
+        }
+        return new XmlWikipediaPageWriter(outputPath); // XML for null or any unrecognised format
}
/**
@@ -283,7 +290,7 @@ public static void convert(
* @param numThreads the number of threads to use
* @param batchsize the size of a batch
* @param filters All filters to append
- * @param outputFormat format of output i.e. xml or plain-text
+ * @param outputFormat format of output i.e. xml, plain-text or one-line
*/
public static void convert(
TemplateConfig config,