You've already forked macports-ports
mirror of
https://github.com/encounter/macports-ports.git
synced 2026-03-30 11:29:27 -07:00
ab16d51d56
git-svn-id: https://svn.macports.org/repository/macports/trunk/dports@90590 d073be05-634f-4543-b044-5fe20cf6d1d6
48 lines
1.5 KiB
Java
48 lines
1.5 KiB
Java
package dualist.pipes;
|
|
|
|
import cc.mallet.pipe.Pipe;
|
|
import cc.mallet.extract.StringSpan;
|
|
import cc.mallet.extract.StringTokenization;
|
|
import cc.mallet.types.Instance;
|
|
import cc.mallet.types.TokenSequence;
|
|
|
|
import org.chasen.mecab.Tagger;
|
|
import org.chasen.mecab.Node;
|
|
|
|
public class SimpleMecabPipe extends Pipe
|
|
{
|
|
static {
|
|
try {
|
|
System.loadLibrary("mecab-java");
|
|
} catch (UnsatisfiedLinkError e) {
|
|
System.err.println("ERROR: Failed to load mecab-java native code.");
|
|
System.err.println(e);
|
|
System.exit(1);
|
|
}
|
|
}
|
|
|
|
public Instance pipe (Instance carrier)
|
|
{
|
|
CharSequence input = (CharSequence) carrier.getData();
|
|
String string = input.toString();
|
|
Tagger tagger = new Tagger();
|
|
Node node = tagger.parseToNode(string);
|
|
int cursor = 0;
|
|
TokenSequence ts = new StringTokenization(input);
|
|
while (node != null) {
|
|
node = node.getNext();
|
|
if (node == null) break;
|
|
String[] f = node.getFeature().split(",");
|
|
if (f[0].equals("名詞") &&
|
|
!f[1].equals("数") && !f[1].equals("サ変接続") && !f[1].equals("接尾") ||
|
|
f[0].equals("未知語")) {
|
|
String surface = node.getSurface();
|
|
cursor = string.indexOf(surface, cursor);
|
|
ts.add (new StringSpan(input, cursor, cursor + surface.length()));
|
|
}
|
|
}
|
|
carrier.setData(ts);
|
|
return carrier;
|
|
}
|
|
}
|