Author: simonw
Date: Wed Nov 25 15:57:11 2009
New Revision: 884162

URL: http://svn.apache.org/viewvc?rev=884162&view=rev
Log:
ORP-2: Added support for hamshahri collection with approximately 160,000 persian documents.

Added:
lucene/openrelevance/trunk/collections/hamshahri/
lucene/openrelevance/trunk/collections/hamshahri/build.xml (with props)
lucene/openrelevance/trunk/collections/hamshahri/src/
lucene/openrelevance/trunk/collections/hamshahri/src/java/
lucene/openrelevance/trunk/collections/hamshahri/src/java/org/
lucene/openrelevance/trunk/collections/hamshahri/src/java/org/apache/
lucene/openrelevance/trunk/collections/hamshahri/src/java/org/apache/or/
lucene/openrelevance/trunk/collections/hamshahri/src/java/org/apache/or/collections/
lucene/openrelevance/trunk/collections/hamshahri/src/java/org/apache/or/collections/hamshahri/
lucene/openrelevance/trunk/collections/hamshahri/src/java/org/apache/or/collections/hamshahri/HamshahriCorpusConverter.java (with props)
lucene/openrelevance/trunk/collections/hamshahri/src/java/org/apache/or/collections/hamshahri/HamshahriQrelConverter.java (with props)
lucene/openrelevance/trunk/collections/hamshahri/src/java/org/apache/or/collections/hamshahri/HamshahriTopicConverter.java (with props)
lucene/openrelevance/trunk/collections/hamshahri/src/java/org/apache/or/collections/hamshahri/HamshahriTopicConverter2.java (with props)
Modified:
lucene/openrelevance/trunk/README.txt

Modified: lucene/openrelevance/trunk/README.txt
URL: http://svn.apache.org/viewvc/lucene/openrelevance/trunk/README.txt?rev=884162&r1=884161&r2=884162&view=diff
==============================================================================
--- lucene/openrelevance/trunk/README.txt (original)
+++ lucene/openrelevance/trunk/README.txt Wed Nov 25 15:57:11 2009
@@ -20,6 +20,7 @@
content.source.log.step=2500
doc.term.vector=false
content.source.forever=false
+content.source.encoding=UTF-8
directory=FSDirectory
doc.stored=true
doc.tokenized=true
@@ -37,7 +38,7 @@
This will create an index in contrib/benchmark/work/index

Step 4:
- java -cp lucene-core-3.0-dev.jar;lucene-benchmark-3.0-dev.jar queries.txt judgements.txt submission.txt contrib/benchmark/work/index
+ java -Dfile.encoding=UTF-8 -cp lucene-core-3.0-dev.jar:lucene-benchmark-3.0-dev.jar org.apache.lucene.benchmark.quality.trec.QueryDriver queries.txt judgements.txt submission.txt contrib/benchmark/work/index

This will print a bunch of information, finally a summary output.


Added: lucene/openrelevance/trunk/collections/hamshahri/build.xml
URL: http://svn.apache.org/viewvc/lucene/openrelevance/trunk/collections/hamshahri/build.xml?rev=884162&view=auto
==============================================================================
--- lucene/openrelevance/trunk/collections/hamshahri/build.xml (added)
+++ lucene/openrelevance/trunk/collections/hamshahri/build.xml Wed Nov 25 15:57:11 2009
@@ -0,0 +1,103 @@
+<?xml version="1.0"?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ -->
+
+<project name="hamshahri" default="dist">
+
+  <import file="../collections-build.xml"/>
+
+  <!-- Locations of the three downloaded archives. Each "*.exists" property
+       is set only when the archive is already present, which lets the
+       matching fetch target below be skipped. -->
+  <property name="Hamshahri-Corpus.zip"
+            location="${build.dir}/download/Hamshahri-Corpus.zip"/>
+  <available file="${Hamshahri-Corpus.zip}" property="corpus.exists"/>
+
+  <property name="Hamshahri-Query_Judgement.zip"
+            location="${build.dir}/download/Hamshahri-Query_Judgement.zip"/>
+  <available file="${Hamshahri-Query_Judgement.zip}" property="judgements.exists"/>
+
+  <property name="Hamshahri-Query_Judgement_old.zip"
+            location="${build.dir}/download/Hamshahri-Query_Judgement_old.zip"/>
+  <available file="${Hamshahri-Query_Judgement_old.zip}"
+             property="judgements2.exists"/>
+
+  <!-- Download the corpus archive unless it is already in the download dir. -->
+  <target name="fetch-corpus" unless="corpus.exists">
+    <mkdir dir="${build.dir}/download"/>
+    <get src="http://ece.ut.ac.ir/dbrg/Hamshahri/Corpus/Hamshahri-Corpus.zip"
+         dest="${Hamshahri-Corpus.zip}"/>
+  </target>
+
+  <!-- Download the queries/judgements archive unless already present. -->
+  <target name="fetch-judgements" unless="judgements.exists">
+    <mkdir dir="${build.dir}/download"/>
+    <get
+        src="http://ece.ut.ac.ir/dbrg/Hamshahri/Corpus/Hamshahri-Query_Judgement.zip"
+        dest="${Hamshahri-Query_Judgement.zip}"/>
+  </target>
+
+  <!-- Download the "old" queries/judgements archive unless already present. -->
+  <target name="fetch-judgements2" unless="judgements2.exists">
+    <mkdir dir="${build.dir}/download"/>
+    <get
+        src="http://ece.ut.ac.ir/dbrg/Hamshahri/Corpus/Hamshahri-Query_Judgement_old.zip"
+        dest="${Hamshahri-Query_Judgement_old.zip}"/>
+  </target>
+
+  <target name="fetch" depends="fetch-corpus,fetch-judgements,fetch-judgements2"/>
+
+  <!-- Unpack all three archives into one shared directory. -->
+  <target name="extract" depends="fetch">
+    <unzip src="${Hamshahri-Corpus.zip}" dest="${build.dir}/extracted" />
+    <unzip src="${Hamshahri-Query_Judgement.zip}" dest="${build.dir}/extracted" />
+    <unzip src="${Hamshahri-Query_Judgement_old.zip}"
+           dest="${build.dir}/extracted" />
+  </target>
+
+  <!-- Run the converters over the extracted files and write the results to
+       ${dist.dir}. Every converter uses failonerror="true": without it Ant
+       only logs a failing converter and still reports a successful build,
+       leaving missing or partial output behind. -->
+  <target name="dist" depends="compile,extract">
+    <mkdir dir="${dist.dir}"/>
+    <java classname="org.apache.or.collections.hamshahri.HamshahriCorpusConverter"
+          failonerror="true">
+      <arg value="${build.dir}/extracted/Hamshahri-Corpus.txt"/>
+      <arg value="${dist.dir}/corpus.gz"/>
+      <classpath refid="classpath"/>
+    </java>
+
+    <!-- Two sets of corresponding queries and judgements
+         The "trec-formatted" one J2/Q2 is judgements.txt/queries.txt
+         respectively.
+         The "old" Queries/Judgement become judgements2.txt/queries2.txt
+    -->
+
+    <java classname="org.apache.or.collections.hamshahri.HamshahriQrelConverter"
+          failonerror="true">
+      <arg value="${build.dir}/extracted/J2.txt"/>
+      <arg value="${dist.dir}/judgements.txt"/>
+      <classpath refid="classpath"/>
+    </java>
+    <java classname="org.apache.or.collections.hamshahri.HamshahriQrelConverter"
+          failonerror="true">
+      <arg value="${build.dir}/extracted/Judgement.txt"/>
+      <arg value="${dist.dir}/judgements2.txt"/>
+      <classpath refid="classpath"/>
+    </java>
+
+    <java classname="org.apache.or.collections.hamshahri.HamshahriTopicConverter"
+          failonerror="true">
+      <arg value="${build.dir}/extracted/Q2.txt"/>
+      <arg value="${dist.dir}/queries.txt"/>
+      <classpath refid="classpath"/>
+    </java>
+    <java classname="org.apache.or.collections.hamshahri.HamshahriTopicConverter2"
+          failonerror="true">
+      <arg value="${build.dir}/extracted/Queries.txt"/>
+      <arg value="${dist.dir}/queries2.txt"/>
+      <classpath refid="classpath"/>
+    </java>
+  </target>
+
+</project>

Propchange: lucene/openrelevance/trunk/collections/hamshahri/build.xml
------------------------------------------------------------------------------
svn:eol-style = native

Added: lucene/openrelevance/trunk/collections/hamshahri/src/java/org/apache/or/collections/hamshahri/HamshahriCorpusConverter.java
URL: http://svn.apache.org/viewvc/lucene/openrelevance/trunk/collections/hamshahri/src/java/org/apache/or/collections/hamshahri/HamshahriCorpusConverter.java?rev=884162&view=auto
==============================================================================
--- lucene/openrelevance/trunk/collections/hamshahri/src/java/org/apache/or/collections/hamshahri/HamshahriCorpusConverter.java (added)
+++ lucene/openrelevance/trunk/collections/hamshahri/src/java/org/apache/or/collections/hamshahri/HamshahriCorpusConverter.java Wed Nov 25 15:57:11 2009
@@ -0,0 +1,79 @@
+package org.apache.or.collections.hamshahri;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
+import java.util.Date;
+import java.util.regex.Pattern;
+
+import org.apache.or.util.TrecDocument;
+import org.apache.or.util.TrecDocumentWriter;
+
+/**
+ * Converts the Hamshahri corpus into a standard format.
+ *
+ * TODO: There is a date, but it is ignored in parsing.
+ * There is also a category for each document, it is ignored too.
+ */
+public class HamshahriCorpusConverter {
+  /** Matches a document header line, i.e. a line that starts with ".DID". */
+  static Pattern didPattern = Pattern.compile("^\\.DID\\s*.*$");
+
+  /**
+   * Reads the corpus file and writes each document it contains through a
+   * {@link TrecDocumentWriter}.
+   *
+   * @param args args[0] is the corpus text file (read as UTF-8),
+   *             args[1] is the output file handed to the writer
+   * @throws Exception if an argument is missing, a document header carries
+   *                   no identifier, or reading/writing fails
+   */
+  public static void main(String args[]) throws Exception {
+    if (args.length < 2) {
+      throw new IllegalArgumentException(
+          "Usage: HamshahriCorpusConverter <corpus file> <output file>");
+    }
+    BufferedReader in = new BufferedReader(new InputStreamReader(
+        new FileInputStream(args[0]), "UTF-8"));
+    try {
+      TrecDocumentWriter writer = new TrecDocumentWriter(new File(args[1]));
+      try {
+        convert(in, writer);
+      } finally {
+        // close even when conversion fails, so the output handle is released
+        writer.close();
+      }
+    } finally {
+      in.close();
+    }
+  }
+
+  /** Splits the input into documents at each ".DID" header and writes them. */
+  private static void convert(BufferedReader in, TrecDocumentWriter writer)
+      throws Exception {
+    TrecDocument doc = new TrecDocument(); // one instance, reused per document
+    Date date = new Date(); // this corpus has dates, but use a fake one.
+    StringBuilder body = new StringBuilder();
+    String did = null;
+    String line;
+
+    while ((line = in.readLine()) != null) {
+      if (!didPattern.matcher(line).matches()) {
+        body.append(line).append('\n');
+        continue;
+      }
+      // a new header ends the previous document, if there was one
+      if (did != null) {
+        writeDocument(writer, doc, did, body, date);
+      }
+      body.setLength(0);
+      String didTokens[] = line.split("\\s+"); // .DID <identifier>
+      if (didTokens.length < 2) {
+        throw new java.io.IOException(
+            "Document header without identifier: " + line);
+      }
+      /*
+       * doc ids in the corpus are in upper case.
+       * doc ids in all the judgements are in lower case!
+       * A fixed locale keeps the result independent of the platform
+       * default (e.g. Turkish would map 'I' to a dotless i).
+       */
+      did = didTokens[1].toLowerCase(java.util.Locale.ENGLISH);
+      in.readLine(); // ignore the date for now
+      in.readLine(); // also ignore the category for now
+    }
+    // the last document; skipped when the input had no header at all
+    if (did != null) {
+      writeDocument(writer, doc, did, body, date);
+    }
+  }
+
+  /** Fills the reusable document with the given values and writes it. */
+  private static void writeDocument(TrecDocumentWriter writer, TrecDocument doc,
+      String did, StringBuilder body, Date date) throws Exception {
+    doc.setDocname(did);
+    doc.setBody(body);
+    doc.setDate(date);
+    writer.write(doc);
+  }
+}

Propchange: lucene/openrelevance/trunk/collections/hamshahri/src/java/org/apache/or/collections/hamshahri/HamshahriCorpusConverter.java
------------------------------------------------------------------------------
svn:eol-style = native

Added: lucene/openrelevance/trunk/collections/hamshahri/src/java/org/apache/or/collections/hamshahri/HamshahriQrelConverter.java
URL: http://svn.apache.org/viewvc/lucene/openrelevance/trunk/collections/hamshahri/src/java/org/apache/or/collections/hamshahri/HamshahriQrelConverter.java?rev=884162&view=auto
==============================================================================
--- lucene/openrelevance/trunk/collections/hamshahri/src/java/org/apache/or/collections/hamshahri/HamshahriQrelConverter.java (added)
+++ lucene/openrelevance/trunk/collections/hamshahri/src/java/org/apache/or/collections/hamshahri/HamshahriQrelConverter.java Wed Nov 25 15:57:11 2009
@@ -0,0 +1,50 @@
+package org.apache.or.collections.hamshahri;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
+
+import org.apache.or.util.TrecQrel;
+import org.apache.or.util.TrecQrelWriter;
+
+/**
+ * Converts the Hamshahri relevance judgements into a standard format.
+ */
+public class HamshahriQrelConverter {
+  /**
+   * Reads whitespace-separated judgement lines and writes each one through a
+   * {@link TrecQrelWriter}. The first three fields of a line are used as
+   * query id, document number and relevance; the iteration is always "0".
+   *
+   * @param args args[0] is the judgement file (read as UTF-8),
+   *             args[1] is the output file handed to the writer
+   * @throws Exception if an argument is missing, a non-blank line has fewer
+   *                   than three fields, the relevance is not an integer,
+   *                   or reading/writing fails
+   */
+  public static void main(String args[]) throws Exception {
+    if (args.length < 2) {
+      throw new IllegalArgumentException(
+          "Usage: HamshahriQrelConverter <judgement file> <output file>");
+    }
+    BufferedReader in = new BufferedReader(new InputStreamReader(
+        new FileInputStream(args[0]), "UTF-8"));
+    try {
+      TrecQrelWriter writer = new TrecQrelWriter(new File(args[1]));
+      try {
+        TrecQrel qrel = new TrecQrel(); // one instance, reused per line
+        String line;
+        while ((line = in.readLine()) != null) {
+          // trim first: leading whitespace would otherwise yield an empty
+          // first field, and a blank line would yield no usable fields
+          String trimmed = line.trim();
+          if (trimmed.length() == 0) {
+            continue;
+          }
+          String parsed[] = trimmed.split("\\s+");
+          if (parsed.length < 3) {
+            throw new java.io.IOException("Malformed judgement line: " + line);
+          }
+          qrel.setQid(parsed[0]);
+          qrel.setIter("0");
+          // should already be in lowercase, just in case; a fixed locale
+          // keeps the result independent of the platform default.
+          qrel.setDocno(parsed[1].toLowerCase(java.util.Locale.ENGLISH));
+          qrel.setRel(Integer.parseInt(parsed[2]));
+          writer.write(qrel);
+        }
+      } finally {
+        // close even when conversion fails, so the output handle is released
+        writer.close();
+      }
+    } finally {
+      in.close();
+    }
+  }
+}

Propchange: lucene/openrelevance/trunk/collections/hamshahri/src/java/org/apache/or/collections/hamshahri/HamshahriQrelConverter.java
------------------------------------------------------------------------------
svn:eol-style = native

Added: lucene/openrelevance/trunk/collections/hamshahri/src/java/org/apache/or/collections/hamshahri/HamshahriTopicConverter.java
URL: http://svn.apache.org/viewvc/lucene/openrelevance/trunk/collections/hamshahri/src/java/org/apache/or/collections/hamshahri/HamshahriTopicConverter.java?rev=884162&view=auto
==============================================================================
--- lucene/openrelevance/trunk/collections/hamshahri/src/java/org/apache/or/collections/hamshahri/HamshahriTopicConverter.java (added)
+++ lucene/openrelevance/trunk/collections/hamshahri/src/java/org/apache/or/collections/hamshahri/HamshahriTopicConverter.java Wed Nov 25 15:57:11 2009
@@ -0,0 +1,75 @@
+package org.apache.or.collections.hamshahri;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
+
+import org.apache.or.util.TrecTopic;
+import org.apache.or.util.TrecTopicWriter;
+
+/**
+ * Converts the Hamshahri topics into a standard format.
+ */
+public class HamshahriTopicConverter {
+  /**
+   * Reads topics from the input and writes each one through a
+   * {@link TrecTopicWriter}. A topic starts at a line equal to "&lt;QID&gt;"
+   * and is read line by line: number, closing tag, then the title,
+   * description and narrative sections, each ended by its closing tag.
+   *
+   * @param args args[0] is the topic file (read as UTF-8),
+   *             args[1] is the output file handed to the writer
+   * @throws Exception if an argument is missing, the input ends inside a
+   *                   section, or reading/writing fails
+   */
+  public static void main(String args[]) throws Exception {
+    if (args.length < 2) {
+      throw new IllegalArgumentException(
+          "Usage: HamshahriTopicConverter <topic file> <output file>");
+    }
+    BufferedReader in = new BufferedReader(new InputStreamReader(
+        new FileInputStream(args[0]), "UTF-8"));
+    try {
+      TrecTopicWriter writer = new TrecTopicWriter(new File(args[1]));
+      try {
+        TrecTopic topic = new TrecTopic(); // one instance, reused per topic
+        String line;
+        while ((line = in.readLine()) != null) {
+          // anything that is not a topic start (including blank lines) is skipped
+          if (!line.equals("<QID>")) {
+            continue;
+          }
+          topic.setNumber(in.readLine());
+          in.readLine(); // </QID>
+          in.readLine(); // <title>
+          topic.setTitle(readUntil(in, "</title>"));
+          in.readLine(); // <description>
+          topic.setDescription(readUntil(in, "</description>"));
+          in.readLine(); // <narrative>
+          topic.setNarrative(readUntil(in, "</narrative>"));
+          writer.write(topic);
+        }
+      } finally {
+        // close even when conversion fails, so the output handle is released
+        writer.close();
+      }
+    } finally {
+      in.close();
+    }
+  }
+
+  /**
+   * Collects lines up to (not including) the line equal to endTag and joins
+   * them with single spaces. Fails with a clear message instead of a
+   * NullPointerException when the input ends before endTag is seen.
+   */
+  private static String readUntil(BufferedReader in, String endTag)
+      throws java.io.IOException {
+    StringBuilder sb = new StringBuilder();
+    String line;
+    while ((line = in.readLine()) != null) {
+      if (line.equals(endTag)) {
+        return sb.toString();
+      }
+      if (sb.length() > 0) {
+        sb.append(" ");
+      }
+      sb.append(line);
+    }
+    throw new java.io.IOException(
+        "Unexpected end of input while looking for " + endTag);
+  }
+}

Propchange: lucene/openrelevance/trunk/collections/hamshahri/src/java/org/apache/or/collections/hamshahri/HamshahriTopicConverter.java
------------------------------------------------------------------------------
svn:eol-style = native

Added: lucene/openrelevance/trunk/collections/hamshahri/src/java/org/apache/or/collections/hamshahri/HamshahriTopicConverter2.java
URL: http://svn.apache.org/viewvc/lucene/openrelevance/trunk/collections/hamshahri/src/java/org/apache/or/collections/hamshahri/HamshahriTopicConverter2.java?rev=884162&view=auto
==============================================================================
--- lucene/openrelevance/trunk/collections/hamshahri/src/java/org/apache/or/collections/hamshahri/HamshahriTopicConverter2.java (added)
+++ lucene/openrelevance/trunk/collections/hamshahri/src/java/org/apache/or/collections/hamshahri/HamshahriTopicConverter2.java Wed Nov 25 15:57:11 2009
@@ -0,0 +1,70 @@
+package org.apache.or.collections.hamshahri;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
+
+import org.apache.or.util.TrecTopic;
+import org.apache.or.util.TrecTopicWriter;
+
+/**
+ * Converts the old Hamshahri topics into a standard format.
+ * This format is slightly different than the "new" one!
+ * For example, there is title and narrative, but not description.
+ */
+public class HamshahriTopicConverter2 {
+  /**
+   * Reads topics from the input and writes each one through a
+   * {@link TrecTopicWriter}. A topic starts at a line equal to "&lt;qid&gt;"
+   * and is read line by line: number, closing tag, then the title and
+   * narrative sections, each ended by its closing tag. The description is
+   * always set to the empty string.
+   *
+   * @param args args[0] is the topic file (read as UTF-8),
+   *             args[1] is the output file handed to the writer
+   * @throws Exception if an argument is missing, the input ends inside a
+   *                   section, or reading/writing fails
+   */
+  public static void main(String args[]) throws Exception {
+    if (args.length < 2) {
+      throw new IllegalArgumentException(
+          "Usage: HamshahriTopicConverter2 <topic file> <output file>");
+    }
+    BufferedReader in = new BufferedReader(new InputStreamReader(
+        new FileInputStream(args[0]), "UTF-8"));
+    try {
+      TrecTopicWriter writer = new TrecTopicWriter(new File(args[1]));
+      try {
+        TrecTopic topic = new TrecTopic(); // one instance, reused per topic
+        String line;
+        while ((line = in.readLine()) != null) {
+          // anything that is not a topic start (including blank lines) is skipped
+          if (!line.equals("<qid>")) {
+            continue;
+          }
+          topic.setNumber(in.readLine());
+          in.readLine(); // </qid>
+          in.readLine(); // <title>
+          topic.setTitle(readUntil(in, "</title>"));
+          topic.setDescription(""); // no description
+          in.readLine(); // <narrative>
+          topic.setNarrative(readUntil(in, "</narrative>"));
+          writer.write(topic);
+        }
+      } finally {
+        // close even when conversion fails, so the output handle is released
+        writer.close();
+      }
+    } finally {
+      in.close();
+    }
+  }
+
+  /**
+   * Collects lines up to (not including) the line equal to endTag and joins
+   * them with single spaces. Fails with a clear message instead of a
+   * NullPointerException when the input ends before endTag is seen.
+   */
+  private static String readUntil(BufferedReader in, String endTag)
+      throws java.io.IOException {
+    StringBuilder sb = new StringBuilder();
+    String line;
+    while ((line = in.readLine()) != null) {
+      if (line.equals(endTag)) {
+        return sb.toString();
+      }
+      if (sb.length() > 0) {
+        sb.append(" ");
+      }
+      sb.append(line);
+    }
+    throw new java.io.IOException(
+        "Unexpected end of input while looking for " + endTag);
+  }
+}

Propchange: lucene/openrelevance/trunk/collections/hamshahri/src/java/org/apache/or/collections/hamshahri/HamshahriTopicConverter2.java
------------------------------------------------------------------------------
svn:eol-style = native

Search Discussions

Related Discussions

Discussion Navigation
viewthread | post
Discussion Overview
groupopenrelevance-dev @
categorieslucene
postedNov 25, '09 at 3:57p
activeNov 25, '09 at 3:57p
posts1
users1
websitelucene.apache.org...

1 user in discussion

Simonw: 1 post

People

Translate

site design / logo © 2018 Grokbase