Add the dependency to pom.xml; the available versions are 1.0, 1.1, 1.2, 1.3 and 1.3.1:
<dependencies>
<dependency>
<groupId>org.apdplat</groupId>
<artifactId>word</artifactId>
<version>1.3</version>
</dependency>
</dependencies>
word 1.3.1 is built from the ForElasticsearch1.7.2 branch and targets compatibility with lucene 4.10.4, solr 4.10.4 and elasticsearch 1.7.2
demo-word.bat
Usage: command [text] [input] [output]
command is one of: demo, text, file
demo
text APDPlat
file d:/text.txt d:/word.txt
exit
List<Word> words = WordSegmenter.seg("APDPlat");
List<Word> words = WordSegmenter.segWithStopWords("APDPlat");
System.out.println(words);
[, apdplat, , , , ]
[, , apdplat, , , , , ]
String input = "d:/text.txt";
String output = "d:/word.txt";
WordSegmenter.seg(new File(input), new File(output));
WordSegmenter.segWithStopWords(new File(input), new File(output));
word.confword-x.x.jar
word.local.conf
UTF-8
UTF-8
classpath:
WordConfTools.set("dic.path", "classpath:dic.txt,d:/custom_dic");
DictionaryFactory.reload();// reload the dictionary after changing the dictionary path
Java
java -Ddic.path=classpath:dic.txt,d:/custom_dic
word.local.conf
dic.path=classpath:dic.txt,d:/custom_dic
dic.txt
, , :
//
//
DictionaryFactory.getDictionary().add("");
//
DictionaryFactory.getDictionary().remove("");
//
List<String> words = new ArrayList<>();
words.add("");
words.add("");
words.add("");
//
DictionaryFactory.getDictionary().addAll(words);
//
DictionaryFactory.getDictionary().removeAll(words);
stopwords.path=classpath:stopwords.txt,d:/custom_stopwords_dic
Example dictionary paths: classpath:dic.txt, classpath:custom_dic_dir,
d:/dic_more.txt, d:/DIC_DIR, D:/DIC2_DIR, my_dic_dir, my_dic_file.txt
Example stop-word paths: classpath:stopwords.txt, classpath:custom_stopwords_dic_dir,
d:/stopwords_more.txt, d:/STOPWORDS_DIR, d:/STOPWORDS2_DIR, stopwords_dir, remove.txt
WordSegmenter.seg("APDPlat", SegmentationAlgorithm.BidirectionalMaximumMatching);
SegmentationAlgorithm
MaximumMatching
ReverseMaximumMatching
MinimumMatching
ReverseMinimumMatching
BidirectionalMaximumMatching
BidirectionalMinimumMatching
BidirectionalMaximumMinimumMatching
FullSegmentation
MinimalWordCount
MaxNgramScore
evaluation.bat
The evaluation test text has 2,533,709 lines and 28,374,490 characters.
target/evaluation
corpus-text.txt
test-text.txtcorpus-text.txt
standard-text.txt
result-text-***.txt***word
perfect-result-***.txt***
wrong-result-***.txt***
1word.confword.local.conf*.pathHTTPredis.*
#
dic.path=http://localhost:8080/word_web/resources/dic.txt
#
part.of.speech.dic.path=http://localhost:8080/word_web/resources/part_of_speech_dic.txt
#
part.of.speech.des.path=http://localhost:8080/word_web/resources/part_of_speech_des.txt
#
bigram.path=http://localhost:8080/word_web/resources/bigram.txt
#
trigram.path=http://localhost:8080/word_web/resources/trigram.txt
#
stopwords.path=http://localhost:8080/word_web/resources/stopwords.txt
#
punctuation.path=http://localhost:8080/word_web/resources/punctuation.txt
#
surname.path=http://localhost:8080/word_web/resources/surname.txt
#
quantifier.path=http://localhost:8080/word_web/resources/quantifier.txt
#redisHTTP
redis.enable=false
#redisHTTP
#redis
redis.host=localhost
#redis
redis.port=6379
2redis
redis, redis,
3HTTPwebhttps://github.com/ysc/word_web tomcat8080
// ""
http://localhost:8080/word_web/admin/dic.jsp?action=add&dic=
// ""
http://localhost:8080/word_web/admin/dic.jsp?action=remove&dic=
dic.jspredis, redis
PartOfSpeechTaggingprocessWordpartOfSpeech
List<Word> words = WordSegmenter.segWithStopWords("");
System.out.println(""+words);
//
PartOfSpeechTagging.process(words);
System.out.println(""+words);
[, , ]
[/r, /v, /ns]
List<Word> words = WordSegmenter.segWithStopWords("");
System.out.println(words);
[, , , , , , , , , , , , ]
[, , , , , , , , , , , , , , ]
word.refine.pathclasspath:word_refine.txt
=
=
refine
words = WordRefiner.refine(words);
System.out.println(words);
[, , , , , , , , , , , , , , ]
List<Word> words = WordSegmenter.segWithStopWords("");
System.out.println(words);
[, , , , , , , , , , , ]
[, , , , , , , , , ]
,
word.refine.pathclasspath:word_refine.txt
=
=
refine
words = WordRefiner.refine(words);
System.out.println(words);
[, , , , , , , , , ]
List<Word> words = WordSegmenter.segWithStopWords("");
System.out.println(words);
[, , , , , ]
SynonymTagging.process(words);
System.out.println(words);
[, [, , , ], , , , []]
SynonymTagging.process(words, false);
System.out.println(words);
[, [, , , ], , , , [, ]]
List<Word> words = WordSegmenter.segWithStopWords("");
System.out.println(words);
[, , , , , , ]
SynonymTagging.process(words);
System.out.println(words);
[, , , [], [, , ], , [, ]]
SynonymTagging.process(words, false);
System.out.println(words);
[, , , [], [, , , , , , , , , , , , , , , , , , , , , ], , [, ]]
WordgetSynonym()
System.out.println(word.getSynonym());
[, , , ]
If a word has no synonyms, getSynonym() returns an empty list: Collections.emptyList()
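Direct versus indirect synonyms. Assume: A and B are synonyms, A and C are synonyms, B and D are synonyms, C and E are synonyms. Then:
For A, the direct synonyms are A B C
For B, the direct synonyms are A B D
For C, the direct synonyms are A C E
For A B C, the indirect synonyms are A B C D E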
List<Word> words = WordSegmenter.segWithStopWords("5");
System.out.println(words);
[5, , , , , , ]
AntonymTagging.process(words);
System.out.println(words);
[5, [, , ], , , , , ]
List<Word> words = WordSegmenter.segWithStopWords(",,");
System.out.println(words);
[, , , , , , , , , , , , , , , , , , , , , , , , ]
AntonymTagging.process(words);
System.out.println(words);
[, , , , , , , , , , , , , , , , , , , , [, , , ], , , , [, , , , , ]]
WordgetAntonym()
System.out.println(word.getAntonym());
[, , ]
If a word has no antonyms, getAntonym() returns an empty list: Collections.emptyList()
List<Word> words = WordSegmenter.segWithStopWords("741220");
System.out.println(words);
[, , , 7, , , , , , 4, 12, , , , , , , , 20, ]
PinyinTagging.process(words);
System.out.println(words);
[ sd sudu, y yu, jq jiqing, 7, d de, zg zhongguo, nd neidi, pf piaofang, z zi, 4, 12, sy shangying, yl yilai, z zai, dd duanduan, lz liangzhou, n nei, tp tupo, 20, rmb renminbi]
Word.getFullPinYin() returns the full pinyin, e.g. sudu
Word.getAcronymPinYin() returns the pinyin initials, e.g. sd
1wordChineseWordAnalyzer
Analyzer analyzer = new ChineseWordAnalyzer();
Analyzer analyzer = new ChineseWordAnalyzer(SegmentationAlgorithm.FullSegmentation);
SegmentationAlgorithm.BidirectionalMaximumMatching
SegmentationAlgorithm
2word
TokenStream tokenStream = analyzer.tokenStream("text", "APDPlat");
//
tokenStream.reset();
//
while(tokenStream.incrementToken()){
//
CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
//
OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
//
PositionIncrementAttribute positionIncrementAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);
LOGGER.info(charTermAttribute.toString()+" ("+offsetAttribute.startOffset()+" - "+offsetAttribute.endOffset()+") "+positionIncrementAttribute.getPositionIncrement());
}
//
tokenStream.close();
3wordLucene
Directory directory = new RAMDirectory();
IndexWriterConfig config = new IndexWriterConfig(analyzer);
IndexWriter indexWriter = new IndexWriter(directory, config);
4wordLucene
QueryParser queryParser = new QueryParser("text", analyzer);
Query query = queryParser.parse("text:");
TopDocs docs = indexSearcher.search(query, Integer.MAX_VALUE);
1word-1.3.jar
http://search.maven.org/remotecontent?filepath=org/apdplat/word/1.3/word-1.3.jar
2. Create the directory solr-5.2.0/example/solr/lib and copy word-1.3.jar into lib
3schema
solr-5.2.0/example/solr/collection1/conf/schema.xml
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<tokenizer class="solr.StandardTokenizerFactory"/>
<tokenizer class="org.apdplat.word.solr.ChineseWordTokenizerFactory"/>
filter
4
<tokenizer class="org.apdplat.word.solr.ChineseWordTokenizerFactory" segAlgorithm="ReverseMinimumMatching"/>
segAlgorithm
MaximumMatching
ReverseMaximumMatching
MinimumMatching
ReverseMinimumMatching
BidirectionalMaximumMatching
BidirectionalMinimumMatching
BidirectionalMaximumMinimumMatching
FullSegmentation
MinimalWordCount
MaxNgramScore
BidirectionalMaximumMatching
5
<tokenizer class="org.apdplat.word.solr.ChineseWordTokenizerFactory" segAlgorithm="ReverseMinimumMatching"
conf="solr-5.2.0/example/solr/nutch/conf/word.local.conf"/>
word.local.conf word-1.3.jar word.conf
word-1.3.jar word.conf
1elasticsearch
cd elasticsearch-5.4.3
2word
wget http://apdplat.org/word/archive/v1.4.1.zip
mkdir plugins/word
unzip -d plugins/word v1.4.1.zip
To use an Elasticsearch version other than 5.4.3, for example 5.6.4, edit plugins/word/plugin-descriptor.properties and set elasticsearch.version=5.6.4
3ElasticSearch
bin/elasticsearch
4Chrome
http://localhost:9200/_analyze?analyzer=word&text=APDPlat
1http://luke.googlecode.com/files/lukeall-4.0.0-ALPHA.jar
2Javaword-1.0-bin.ziphttp://pan.baidu.com/s/1dDziDFz
3 Javaword-1.0-bin/word-1.0 4jar
winrarlukeall-4.0.0-ALPHA.jarMETA-INF.jar
.bat.htmlword.local.conflukeall-4.0.0-ALPHA.jar
4 java -jar lukeall-4.0.0-ALPHA.jar lukeSearchAnalysis
org.apdplat.word.lucene.ChineseWordAnalyzer
5PluginsAvailable analyzers found on the current classpath
org.apdplat.word.lucene.ChineseWordAnalyzer
wordmvn install
mvn dependency:copy-dependenciesjartarget/dependency/
jartarget/dependency/slf4j-api-1.6.4.jarword
target/dependency/logback-classic-0.9.28.jar
target/dependency/logback-core-0.9.28.jarword
target/classes/logback.xmltarget/word-1.3.jarwordjar
target/classes/word.conf
Lukelucene4.0.0 lukeall-4.0.0-ALPHA-with-word-1.0.jar
Lukelucene4.10.3lukeall-4.10.3-with-word-1.2.jar
NN
1
2
1wordword
demo-word-vector-corpus.bat demo-word-vector-corpus.sh
2word
demo-word-vector-file.bat demo-word-vector-file.sh
demo-word-vector-corpus.sh
sa=cos
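1. sa=cos - cosine similarity
2. sa=edi - edit distance
3. sa=euc - Euclidean distance
4. sa=sim - simple common words
5. sa=jac - Jaccard similarity coefficient
6. sa=man - Manhattan distance
7. sa=shh - SimHash + Hamming distance
8. sa=ja - Jaro distance
9. sa=jaw - Jaro-Winkler distance
10. sa=sd - Sørensen-Dice coefficient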
limit=15
exit
EditDistanceTextSimilarity
----------------------------------------------------------
1 1.0
2 0.21
3 0.2
4 0.19
5 0.17
6 0.17
7 0.17
8 0.17
9 0.16
10 0.15
11 0.14
12 0.14
13 0.14
14 0.13
15 0.13
----------------------------------------------------------
1
1/N
: [ 1.0, 0.78205127, 0.7692308, 0.42307693, 0.41025642, 0.3846154, 0.32051283, 0.2948718, 0.2820513, 0.26923078, 0.23076923, 0.21794872, 0.20512821, 0.20512821, 0.20512821, 0.17948718, 0.15384616, 0.15384616, 0.15384616, 0.14102565, 0.14102565, 0.12820514, 0.12820514, 0.12820514, 0.12820514, 0.115384616, 0.102564104, 0.102564104, 0.102564104, 0.102564104]
: [ 1.0, 0.7119143, 0.19384204, 0.17831326, 0.16385542, 0.1394913, 0.13226238, 0.12717536, 0.11700134, 0.1145917, 0.11218206, 0.10200803, 0.08299866, 0.07951807, 0.06961178, 0.06827309, 0.066398926, 0.063453816, 0.06184739, 0.059973225, 0.05863454, 0.057563588, 0.056492638, 0.055421688, 0.05381526, 0.053547524, 0.053547524, 0.05274431, 0.052208837, 0.05167336]
: [ 1.0, 0.46666667, 0.45555556, 0.2962963, 0.2777778, 0.27407408, 0.24814814, 0.23333333, 0.22222222, 0.21851853, 0.2074074, 0.19259259, 0.19259259, 0.18518518, 0.18148148, 0.17037037, 0.16296296, 0.14074074, 0.14074074, 0.12962963, 0.12962963, 0.12222222, 0.12222222, 0.11851852, 0.11851852, 0.11481482, 0.11481482, 0.11481482, 0.11111111, 0.11111111]
: [ 1.0, 0.6136364, 0.39772728, 0.3409091, 0.26136363, 0.25, 0.23863636, 0.22727273, 0.1590909, 0.1590909, 0.14772727, 0.13636364, 0.13636364, 0.13636364, 0.125, 0.125, 0.11363637, 0.11363637, 0.10227273, 0.09090909, 0.09090909, 0.09090909, 0.09090909, 0.07954545, 0.07954545, 0.07954545, 0.06818182, 0.06818182, 0.06818182, 0.06818182]
: [ 1.0, 0.4117647, 0.1875, 0.17830883, 0.17463236, 0.17095588, 0.15441176, 0.15441176, 0.14338236, 0.1360294, 0.13419117, 0.121323526, 0.1194853, 0.11764706, 0.0992647, 0.09375, 0.09191176, 0.090073526, 0.08455882, 0.080882356, 0.07904412, 0.075367644, 0.073529415, 0.07169118, 0.07169118, 0.0680147, 0.060661763, 0.060661763, 0.05882353, 0.05882353]
: [ 1.0, 0.75, 0.7058824, 0.5882353, 0.5882353, 0.5588235, 0.5294118, 0.50735295, 0.5, 0.5, 0.5, 0.49264705, 0.4852941, 0.4632353, 0.44117647, 0.42647058, 0.4117647, 0.4117647, 0.3602941, 0.3602941, 0.34558824, 0.3382353, 0.31617647, 0.31617647, 0.30882353, 0.29411766, 0.2867647, 0.2867647, 0.2720588, 0.25]
: [ 1.0, 0.41584158, 0.36138615, 0.25742576, 0.23762377, 0.20792079, 0.18811882, 0.18316832, 0.17821783, 0.17326732, 0.15841584, 0.15346535, 0.14356436, 0.12871288, 0.12376238, 0.12376238, 0.11386139, 0.10891089, 0.0990099, 0.08415841, 0.07920792, 0.07920792, 0.074257426, 0.06930693, 0.06930693, 0.06930693, 0.06930693, 0.06930693, 0.06930693, 0.06435644]
: [ 1.0, 0.8, 0.62222224, 0.54444444, 0.36666667, 0.31111112, 0.26666668, 0.18888889, 0.17777778, 0.15555556, 0.14444445, 0.11111111, 0.11111111, 0.11111111, 0.11111111, 0.11111111, 0.11111111, 0.08888889, 0.08888889, 0.08888889, 0.08888889, 0.08888889, 0.08888889, 0.07777778, 0.07777778, 0.07777778, 0.07777778, 0.07777778, 0.06666667, 0.06666667]
: [ 1.0, 0.73333335, 0.46666667, 0.43333334, 0.4, 0.4, 0.4, 0.33333334, 0.26666668, 0.26666668, 0.26666668, 0.26666668, 0.26666668, 0.23333333, 0.2, 0.2, 0.2, 0.2, 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.13333334, 0.13333334, 0.13333334, 0.13333334, 0.13333334, 0.13333334, 0.13333334]
: [ 1.0, 0.8, 0.4, 0.4, 0.3, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
: [ 1.0, 0.5123967, 0.3181818, 0.27272728, 0.23553719, 0.20661157, 0.18595041, 0.15289256, 0.14876033, 0.14049587, 0.14049587, 0.1322314, 0.12809917, 0.12809917, 0.12809917, 0.12396694, 0.11570248, 0.11570248, 0.11157025, 0.11157025, 0.10743801, 0.10330579, 0.10330579, 0.09917355, 0.09090909, 0.08677686, 0.08677686, 0.08264463, 0.08264463, 0.08264463]
: [ 1.0, 0.8181818, 0.53333336, 0.4848485, 0.4121212, 0.38787878, 0.36969697, 0.36363637, 0.34545454, 0.3030303, 0.2969697, 0.28484848, 0.27272728, 0.27272728, 0.26666668, 0.24242425, 0.24242425, 0.24242425, 0.24242425, 0.23636363, 0.21818182, 0.21818182, 0.21818182, 0.21212122, 0.2060606, 0.2060606, 0.2060606, 0.19393939, 0.18181819, 0.16969697]
: [ 1.0, 0.52380955, 0.5, 0.33333334, 0.33333334, 0.23809524, 0.23809524, 0.1904762, 0.1904762, 0.1904762, 0.16666667, 0.14285715, 0.14285715, 0.14285715, 0.14285715, 0.14285715, 0.14285715, 0.14285715, 0.11904762, 0.0952381, 0.0952381, 0.0952381, 0.0952381, 0.0952381, 0.0952381, 0.0952381, 0.0952381, 0.0952381, 0.0952381, 0.071428575]
: [ 1.0, 0.91935486, 0.7580645, 0.61290324, 0.58064514, 0.5645161, 0.5483871, 0.48387095, 0.4032258, 0.38709676, 0.37096775, 0.3548387, 0.32258064, 0.30645162, 0.2580645, 0.2580645, 0.22580644, 0.22580644, 0.22580644, 0.20967741, 0.19354838, 0.19354838, 0.19354838, 0.17741935, 0.17741935, 0.17741935, 0.17741935, 0.17741935, 0.17741935, 0.16129032]
: [ 1.0, 0.8235294, 0.3529412, 0.3529412, 0.29411766, 0.29411766, 0.23529412, 0.23529412, 0.1764706, 0.1764706, 0.1764706, 0.1764706, 0.14705883, 0.14705883, 0.11764706, 0.11764706, 0.11764706, 0.11764706, 0.11764706, 0.0882353, 0.0882353, 0.0882353, 0.0882353, 0.0882353, 0.0882353, 0.05882353, 0.05882353, 0.05882353, 0.05882353, 0.05882353]
7
----------------------------------------------------------
CosineTextSimilarity
1 1.0
2 0.5
3 0.47
4 0.46
5 0.46
6 0.46
7 0.46
8 0.42
9 0.42
10 0.42
11 0.39
12 0.38
13 0.38
14 0.37
15 0.37
25,572
----------------------------------------------------------
EditDistanceTextSimilarity
1 1.0
2 0.21
3 0.2
4 0.19
5 0.17
6 0.17
7 0.17
8 0.17
9 0.16
10 0.15
11 0.14
12 0.14
13 0.14
14 0.13
15 0.13
44,253
----------------------------------------------------------
EuclideanDistanceTextSimilarity
1 1.0
2 0.37
3 0.37
4 0.37
5 0.37
6 0.37
7 0.37
8 0.36
9 0.36
10 0.36
11 0.36
12 0.36
13 0.36
14 0.36
15 0.36
24,710
----------------------------------------------------------
SimpleTextSimilarity
1 1.0
2 0.36
3 0.33
4 0.33
5 0.33
6 0.32
7 0.32
8 0.3
9 0.3
10 0.3
11 0.29
12 0.29
13 0.29
14 0.28
15 0.28
21,918
----------------------------------------------------------
JaccardTextSimilarity
1 1.0
2 0.22
3 0.2
4 0.18
5 0.18
6 0.18
7 0.18
8 0.15
9 0.15
10 0.15
11 0.15
12 0.15
13 0.15
14 0.15
15 0.13
19,717
----------------------------------------------------------
ManhattanDistanceTextSimilarity
1 1.0
2 0.11
3 0.11
4 0.11
5 0.11
6 0.11
7 0.11
8 0.11
9 0.11
10 0.11
11 0.11
12 0.11
13 0.1
14 0.1
15 0.1
23,857
----------------------------------------------------------
SimHashPlusHammingDistanceTextSimilarity
1 1.0
2 0.96
3 0.95
4 0.95
5 0.95
6 0.95
7 0.95
8 0.95
9 0.94
10 0.94
11 0.94
12 0.94
13 0.94
14 0.94
15 0.94
5,57,339
----------------------------------------------------------
JaroDistanceTextSimilarity
1 1.0
2 0.49
3 0.49
4 0.48
5 0.47
6 0.46
7 0.46
8 0.45
9 0.45
10 0.45
11 0.45
12 0.45
13 0.44
14 0.44
15 0.44
12,718
----------------------------------------------------------
JaroWinklerDistanceTextSimilarity
1 1.0
2 0.56
3 0.55
4 0.55
5 0.54
6 0.53
7 0.53
8 0.53
9 0.52
10 0.52
11 0.52
12 0.51
13 0.51
14 0.51
15 0.51
16,723
----------------------------------------------------------
SrensenDiceCoefficientTextSimilarity
1 1.0
2 0.37
3 0.33
4 0.3
5 0.3
6 0.3
7 0.3
8 0.27
9 0.27
10 0.27
11 0.27
12 0.27
13 0.27
14 0.27
15 0.23
19,852
----------------------------------------------------------
org.apdplat.word.WordFrequencyStatistics
text.txt
chmod +x wfs.sh && ./wfs.sh -textFile=text.txt -statisticsResultFile=statistics-result.txt
statistics-result.txt
//
WordFrequencyStatistics wordFrequencyStatistics = new WordFrequencyStatistics();
wordFrequencyStatistics.setRemoveStopWord(false);
wordFrequencyStatistics.setResultPath("word-frequency-statistics.txt");
wordFrequencyStatistics.setSegmentationAlgorithm(SegmentationAlgorithm.MaxNgramScore);
//
wordFrequencyStatistics.seg("");
//
wordFrequencyStatistics.dump();
//
Files.write(Paths.get("text-to-seg.txt"), Arrays.asList("wordJavangram"));
//
wordFrequencyStatistics.reset();
//
wordFrequencyStatistics.seg(new File("text-to-seg.txt"), new File("text-seg-result.txt"));
//
wordFrequencyStatistics.dump("file-seg-statistics-result.txt");
1 2
2 2
3 2
4 1
5 1
6 1
7 1
8 1
9 1
10 1
11 1
12 1
13 1
14 1
15 1
1 2
2 2
3 1
4word 1
5 1
6 1
7ngram 1
8 1
9 1
10 1
11 1
12 1
13 1
14 1
15 1
16 1
17 1
18 1
19 1
20 1
21 1
22Java 1
word
org.apdplat.word.analysis.CosineTextSimilarity
String text1 = "";
String text2 = "";
String text3 = "";
TextSimilarity textSimilarity = new CosineTextSimilarity();
double score1pk1 = textSimilarity.similarScore(text1, text1);
double score1pk2 = textSimilarity.similarScore(text1, text2);
double score1pk3 = textSimilarity.similarScore(text1, text3);
double score2pk2 = textSimilarity.similarScore(text2, text2);
double score2pk3 = textSimilarity.similarScore(text2, text3);
double score3pk3 = textSimilarity.similarScore(text3, text3);
System.out.println(text1+" "+text1+" "+score1pk1);
System.out.println(text1+" "+text2+" "+score1pk2);
System.out.println(text1+" "+text3+" "+score1pk3);
System.out.println(text2+" "+text2+" "+score2pk2);
System.out.println(text2+" "+text3+" "+score2pk3);
System.out.println(text3+" "+text3+" "+score3pk3);
1.0
0.67
0.0
1.0
0.0
1.0
org.apdplat.word.analysis.SimpleTextSimilarity
String text1 = "";
String text2 = "";
String text3 = "";
TextSimilarity textSimilarity = new SimpleTextSimilarity();
double score1pk1 = textSimilarity.similarScore(text1, text1);
double score1pk2 = textSimilarity.similarScore(text1, text2);
double score1pk3 = textSimilarity.similarScore(text1, text3);
double score2pk2 = textSimilarity.similarScore(text2, text2);
double score2pk3 = textSimilarity.similarScore(text2, text3);
double score3pk3 = textSimilarity.similarScore(text3, text3);
System.out.println(text1+" "+text1+" "+score1pk1);
System.out.println(text1+" "+text2+" "+score1pk2);
System.out.println(text1+" "+text3+" "+score1pk3);
System.out.println(text2+" "+text2+" "+score2pk2);
System.out.println(text2+" "+text3+" "+score2pk3);
System.out.println(text3+" "+text3+" "+score3pk3);
1.0
0.5
0.0
1.0
0.0
1.0
org.apdplat.word.analysis.EditDistanceTextSimilarity
String text1 = "";
String text2 = "";
String text3 = "";
TextSimilarity textSimilarity = new EditDistanceTextSimilarity();
double score1pk1 = textSimilarity.similarScore(text1, text1);
double score1pk2 = textSimilarity.similarScore(text1, text2);
double score1pk3 = textSimilarity.similarScore(text1, text3);
double score2pk2 = textSimilarity.similarScore(text2, text2);
double score2pk3 = textSimilarity.similarScore(text2, text3);
double score3pk3 = textSimilarity.similarScore(text3, text3);
System.out.println(text1+" "+text1+" "+score1pk1);
System.out.println(text1+" "+text2+" "+score1pk2);
System.out.println(text1+" "+text3+" "+score1pk3);
System.out.println(text2+" "+text2+" "+score2pk2);
System.out.println(text2+" "+text3+" "+score2pk3);
System.out.println(text3+" "+text3+" "+score3pk3);
1.0
0.5
0.0
1.0
0.0
1.0
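SimHash + Hamming distance: SimHash first maps texts of different lengths to equal-length values, then the Hamming distance between those equal-length values is computed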
org.apdplat.word.analysis.SimHashPlusHammingDistanceTextSimilarity
String text1 = "";
String text2 = "";
String text3 = "";
TextSimilarity textSimilarity = new SimHashPlusHammingDistanceTextSimilarity();
double score1pk1 = textSimilarity.similarScore(text1, text1);
double score1pk2 = textSimilarity.similarScore(text1, text2);
double score1pk3 = textSimilarity.similarScore(text1, text3);
double score2pk2 = textSimilarity.similarScore(text2, text2);
double score2pk3 = textSimilarity.similarScore(text2, text3);
double score3pk3 = textSimilarity.similarScore(text3, text3);
System.out.println(text1+" "+text1+" "+score1pk1);
System.out.println(text1+" "+text2+" "+score1pk2);
System.out.println(text1+" "+text3+" "+score1pk3);
System.out.println(text2+" "+text2+" "+score2pk2);
System.out.println(text2+" "+text3+" "+score2pk3);
System.out.println(text3+" "+text3+" "+score3pk3);
1.0
0.95
0.83
1.0
0.86
1.0
JaccardJaccard similarity coefficient
org.apdplat.word.analysis.JaccardTextSimilarity
String text1 = "";
String text2 = "";
String text3 = "";
TextSimilarity textSimilarity = new JaccardTextSimilarity();
double score1pk1 = textSimilarity.similarScore(text1, text1);
double score1pk2 = textSimilarity.similarScore(text1, text2);
double score1pk3 = textSimilarity.similarScore(text1, text3);
double score2pk2 = textSimilarity.similarScore(text2, text2);
double score2pk3 = textSimilarity.similarScore(text2, text3);
double score3pk3 = textSimilarity.similarScore(text3, text3);
System.out.println(text1+" "+text1+" "+score1pk1);
System.out.println(text1+" "+text2+" "+score1pk2);
System.out.println(text1+" "+text3+" "+score1pk3);
System.out.println(text2+" "+text2+" "+score2pk2);
System.out.println(text2+" "+text3+" "+score2pk3);
System.out.println(text3+" "+text3+" "+score3pk3);
1.0
0.5
0.0
1.0
0.0
1.0
Euclidean Distance
org.apdplat.word.analysis.EuclideanDistanceTextSimilarity
String text1 = "";
String text2 = "";
String text3 = "";
TextSimilarity textSimilarity = new EuclideanDistanceTextSimilarity();
double score1pk1 = textSimilarity.similarScore(text1, text1);
double score1pk2 = textSimilarity.similarScore(text1, text2);
double score1pk3 = textSimilarity.similarScore(text1, text3);
double score2pk2 = textSimilarity.similarScore(text2, text2);
double score2pk3 = textSimilarity.similarScore(text2, text3);
double score3pk3 = textSimilarity.similarScore(text3, text3);
System.out.println(text1+" "+text1+" "+score1pk1);
System.out.println(text1+" "+text2+" "+score1pk2);
System.out.println(text1+" "+text3+" "+score1pk3);
System.out.println(text2+" "+text2+" "+score2pk2);
System.out.println(text2+" "+text3+" "+score2pk3);
System.out.println(text3+" "+text3+" "+score3pk3);
1.0
0.41
0.29
1.0
0.29
1.0
Manhattan Distance
org.apdplat.word.analysis.ManhattanDistanceTextSimilarity
String text1 = "";
String text2 = "";
String text3 = "";
TextSimilarity textSimilarity = new ManhattanDistanceTextSimilarity();
double score1pk1 = textSimilarity.similarScore(text1, text1);
double score1pk2 = textSimilarity.similarScore(text1, text2);
double score1pk3 = textSimilarity.similarScore(text1, text3);
double score2pk2 = textSimilarity.similarScore(text2, text2);
double score2pk3 = textSimilarity.similarScore(text2, text3);
double score3pk3 = textSimilarity.similarScore(text3, text3);
System.out.println(text1+" "+text1+" "+score1pk1);
System.out.println(text1+" "+text2+" "+score1pk2);
System.out.println(text1+" "+text3+" "+score1pk3);
System.out.println(text2+" "+text2+" "+score2pk2);
System.out.println(text2+" "+text3+" "+score2pk3);
System.out.println(text3+" "+text3+" "+score3pk3);
1.0
0.33
0.14
1.0
0.14
1.0
JaroJaro Distance
org.apdplat.word.analysis.JaroDistanceTextSimilarity
String text1 = "";
String text2 = "";
String text3 = "";
TextSimilarity textSimilarity = new JaroDistanceTextSimilarity();
double score1pk1 = textSimilarity.similarScore(text1, text1);
double score1pk2 = textSimilarity.similarScore(text1, text2);
double score1pk3 = textSimilarity.similarScore(text1, text3);
double score2pk2 = textSimilarity.similarScore(text2, text2);
double score2pk3 = textSimilarity.similarScore(text2, text3);
double score3pk3 = textSimilarity.similarScore(text3, text3);
System.out.println(text1+" "+text1+" "+score1pk1);
System.out.println(text1+" "+text2+" "+score1pk2);
System.out.println(text1+" "+text3+" "+score1pk3);
System.out.println(text2+" "+text2+" "+score2pk2);
System.out.println(text2+" "+text3+" "+score2pk3);
System.out.println(text3+" "+text3+" "+score3pk3);
1.0
0.67
0.0
1.0
0.0
1.0
JaroWinklerJaroWinkler DistanceJaro
org.apdplat.word.analysis.JaroWinklerDistanceTextSimilarity
String text1 = "";
String text2 = "";
String text3 = "";
TextSimilarity textSimilarity = new JaroWinklerDistanceTextSimilarity();
double score1pk1 = textSimilarity.similarScore(text1, text1);
double score1pk2 = textSimilarity.similarScore(text1, text2);
double score1pk3 = textSimilarity.similarScore(text1, text3);
double score2pk2 = textSimilarity.similarScore(text2, text2);
double score2pk3 = textSimilarity.similarScore(text2, text3);
double score3pk3 = textSimilarity.similarScore(text3, text3);
System.out.println(text1+" "+text1+" "+score1pk1);
System.out.println(text1+" "+text2+" "+score1pk2);
System.out.println(text1+" "+text3+" "+score1pk3);
System.out.println(text2+" "+text2+" "+score2pk2);
System.out.println(text2+" "+text3+" "+score2pk3);
System.out.println(text3+" "+text3+" "+score3pk3);
1.0
0.73
0.0
1.0
0.0
1.0
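Sørensen-Dice coefficient: similarity is 2 times the size of the intersection of the two sets, divided by the sum of the sizes of the two sets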
org.apdplat.word.analysis.SrensenDiceCoefficientTextSimilarity
String text1 = "";
String text2 = "";
String text3 = "";
TextSimilarity textSimilarity = new SrensenDiceCoefficientTextSimilarity();
double score1pk1 = textSimilarity.similarScore(text1, text1);
double score1pk2 = textSimilarity.similarScore(text1, text2);
double score1pk3 = textSimilarity.similarScore(text1, text3);
double score2pk2 = textSimilarity.similarScore(text2, text2);
double score2pk3 = textSimilarity.similarScore(text2, text3);
double score3pk3 = textSimilarity.similarScore(text3, text3);
System.out.println(text1+" "+text1+" "+score1pk1);
System.out.println(text1+" "+text2+" "+score1pk2);
System.out.println(text1+" "+text3+" "+score1pk3);
System.out.println(text2+" "+text2+" "+score2pk2);
System.out.println(text2+" "+text3+" "+score2pk3);
System.out.println(text3+" "+text3+" "+score3pk3);
1.0
0.67
0.0
1.0
0.0
1.0
:
unix-like:
chmod +x sentence-identify.sh && ./sentence-identify.sh
windows:
.\sentence-identify.bat
org.apdplat.word.analysis.SentenceIdentify :
1. : , : 0.71428573
2. : , : 0.6666667
3. : , : 0.5
4. : , : 0.5
5. : , : 0.2857143
6. : , : 0.2857143
7. : , : 0.25
8. : , : 0.22222222
9. : , : 0.2
10. : , : 0.2
:
:
: [, , , , , ]
:
: 1.0
:
:
: [, , , , , , , , , , , , ]
:
: 0.8333333
1word Ngram
370.9714 /
66.55% 33.44% 2533709 1686210 847499
60.94% 39.05% 28374490 17293964 11080526
2word
330.1586 /
65.67% 34.32% 2533709 1663958 869751
60.12% 39.87% 28374490 17059641 11314849
3word
62.960262 /
57.2% 42.79% 2533709 1449288 1084421
47.95% 52.04% 28374490 13605742 14768748
4word
462.87158 /
53.06% 46.93% 2533709 1344624 1189085
43.07% 56.92% 28374490 12221610 16152880
5word
967.68604 /
46.34% 53.65% 2533709 1174276 1359433
36.07% 63.92% 28374490 10236574 18137916
6word
661.148 /
46.18% 53.81% 2533709 1170075 1363634
35.65% 64.34% 28374490 10117122 18257368
7word
1567.1318 /
41.88% 58.11% 2533709 1061189 1472520
31.35% 68.64% 28374490 8896173 19478317
8word
1232.6017 /
41.69% 58.3% 2533709 1056515 1477194
30.98% 69.01% 28374490 8792532 19581958
9word
1936.9575 /
41.42% 58.57% 2533709 1049673 1484036
31.34% 68.65% 28374490 8893622 19480868
10word
2228.9465 /
36.7% 63.29% 2533709 930069 1603640
26.72% 73.27% 28374490 7583741 20790749