Files

48 lines
1.5 KiB
Java

package dualist.pipes;
import cc.mallet.pipe.Pipe;
import cc.mallet.extract.StringSpan;
import cc.mallet.extract.StringTokenization;
import cc.mallet.types.Instance;
import cc.mallet.types.TokenSequence;
import org.chasen.mecab.Tagger;
import org.chasen.mecab.Node;
public class SimpleMecabPipe extends Pipe
{
static {
try {
System.loadLibrary("mecab-java");
} catch (UnsatisfiedLinkError e) {
System.err.println("ERROR: Failed to load mecab-java native code.");
System.err.println(e);
System.exit(1);
}
}
public Instance pipe (Instance carrier)
{
CharSequence input = (CharSequence) carrier.getData();
String string = input.toString();
Tagger tagger = new Tagger();
Node node = tagger.parseToNode(string);
int cursor = 0;
TokenSequence ts = new StringTokenization(input);
while (node != null) {
node = node.getNext();
if (node == null) break;
String[] f = node.getFeature().split(",");
if (f[0].equals("名詞") &&
!f[1].equals("") && !f[1].equals("サ変接続") && !f[1].equals("接尾") ||
f[0].equals("未知語")) {
String surface = node.getSurface();
cursor = string.indexOf(surface, cursor);
ts.add (new StringSpan(input, cursor, cursor + surface.length()));
}
}
carrier.setData(ts);
return carrier;
}
}