word

Java分布式中文分词组件 - word分词

APACHE-2.0 License

Stars
1.8K
Committers
5

Java - word

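word分词是一个Java实现的分布式的中文分词组件，提供了多种基于词典的分词算法，并利用ngram模型来消除歧义。能准确识别英文、数字，以及日期、时间等数量词，能识别人名、地名、组织机构名等未登录词。能通过自定义配置文件来改变组件行为，能自定义用户词库、自动检测词库变化、支持大规模分布式环境，能灵活指定多种分词算法，能使用refine功能灵活控制分词结果，还能使用词频统计、词性标注、同义标注、反义标注、拼音标注等功能。提供了10种分词算法，还提供了10种文本相似度算法，同时还无缝和Lucene、Solr、ElasticSearch、Luke集成。注意：word1.3需要JDK1.8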

API

word 1.0 API

word 1.1 API

word 1.2 API

word 1.3 API

jar : ku8i

Maven

在pom.xml中指定dependency，可用版本有1.0、1.1、1.2、1.3、1.3.1：

<dependencies>
	<dependency>
		<groupId>org.apdplat</groupId>
		<artifactId>word</artifactId>
		<version>1.3</version>
	</dependency>
</dependencies>

word 1.3.1这个版本是从代码分支ForElasticsearch1.7.2中编译出来的，主要目的是支持与lucene4.10.4、solr4.10.4和elasticsearch1.7.2兼容的版本。

1

demo-word.bat
用法: command [text] [input] [output]
命令command的可选值为：demo、text、file
demo
text APDPlat
file d:/text.txt d:/word.txt
exit

2

移除停用词：List<Word> words = WordSegmenter.seg("杨尚川是APDPlat应用级产品开发平台的作者");
保留停用词：List<Word> words = WordSegmenter.segWithStopWords("杨尚川是APDPlat应用级产品开发平台的作者");
			System.out.println(words);

输出：
移除停用词：[杨尚川, apdplat, 应用级, 产品, 开发平台, 作者]
保留停用词：[杨尚川, 是, apdplat, 应用级, 产品, 开发平台, 的, 作者]

3

String input = "d:/text.txt";
String output = "d:/word.txt";
WordSegmenter.seg(new File(input), new File(output));
WordSegmenter.segWithStopWords(new File(input), new File(output));

4

默认配置文件为类路径下的word.conf，打包在word-x.x.jar中
自定义配置文件为类路径下的word.local.conf，需要用户自己提供
如果自定义配置和默认配置相同，自定义配置会覆盖默认配置
配置文件编码为UTF-8

5

UTF-8


classpath:
	

	
		WordConfTools.set("dic.path", "classpath:dic.txt,d:/custom_dic");
		DictionaryFactory.reload();//
	Java
		java -Ddic.path=classpath:dic.txt,d:/custom_dic
	
		word.local.conf
		dic.path=classpath:dic.txt,d:/custom_dic

dic.txt

, , :

// 
// 
DictionaryFactory.getDictionary().add("");
// 
DictionaryFactory.getDictionary().remove("");
// 
List<String> words = new ArrayList<>();
words.add("");
words.add("");
words.add("");
// 
DictionaryFactory.getDictionary().addAll(words);
// 
DictionaryFactory.getDictionary().removeAll(words);

6

stopwords.path=classpath:stopwords.txt,d:/custom_stopwords_dic

7

classpath:dic.txt,classpath:custom_dic_dir,
d:/dic_more.txt,d:/DIC_DIR,D:/DIC2_DIR,my_dic_dir,my_dic_file.txt

classpath:stopwords.txt,classpath:custom_stopwords_dic_dir,
d:/stopwords_more.txt,d:/STOPWORDS_DIR,d:/STOPWORDS2_DIR,stopwords_dir,remove.txt

8

WordSegmenter.seg("APDPlat", SegmentationAlgorithm.BidirectionalMaximumMatching);

SegmentationAlgorithm	 
MaximumMatching
ReverseMaximumMatching
MinimumMatching
ReverseMinimumMatching
BidirectionalMaximumMatching
BidirectionalMinimumMatching
BidirectionalMaximumMinimumMatching
FullSegmentation
MinimalWordCount
最大Ngram分值算法：MaxNgramScore

9

evaluation.bat
评估采用的测试文本有2533709行，共28374490个字符
target/evaluation
corpus-text.txt
test-text.txtcorpus-text.txt
standard-text.txt
result-text-***.txt***word
perfect-result-***.txt***
wrong-result-***.txt***

10

1word.confword.local.conf*.pathHTTPredis.*

    #
    dic.path=http://localhost:8080/word_web/resources/dic.txt
    #
    part.of.speech.dic.path=http://localhost:8080/word_web/resources/part_of_speech_dic.txt
    #
    part.of.speech.des.path=http://localhost:8080/word_web/resources/part_of_speech_des.txt
    #
    bigram.path=http://localhost:8080/word_web/resources/bigram.txt
    #
    trigram.path=http://localhost:8080/word_web/resources/trigram.txt
    #
    stopwords.path=http://localhost:8080/word_web/resources/stopwords.txt
    #
    punctuation.path=http://localhost:8080/word_web/resources/punctuation.txt
    #
    surname.path=http://localhost:8080/word_web/resources/surname.txt
    #
    quantifier.path=http://localhost:8080/word_web/resources/quantifier.txt

    #redisHTTP
    redis.enable=false
    #redisHTTP
    #redis
    redis.host=localhost
    #redis
    redis.port=6379

2redis

    redis, redis, 

3HTTPwebhttps://github.com/ysc/word_web tomcat8080

    // ""
    http://localhost:8080/word_web/admin/dic.jsp?action=add&dic=
    // ""
    http://localhost:8080/word_web/admin/dic.jsp?action=remove&dic=

    dic.jspredis, redis

11

PartOfSpeechTaggingprocessWordpartOfSpeech

List<Word> words = WordSegmenter.segWithStopWords("");
System.out.println(""+words);
//
PartOfSpeechTagging.process(words);
System.out.println(""+words);

[, , ]
[/r, /v, /ns]

12refine

List<Word> words = WordSegmenter.segWithStopWords("");
System.out.println(words);

[, , , , , , , , , , , , ]

[, , , , , , , , , , , , , , ]
  
word.refine.pathclasspath:word_refine.txt
= 
= 
refine
words = WordRefiner.refine(words);
System.out.println(words);

[, , , , , , , , , , , , , , ]


List<Word> words = WordSegmenter.segWithStopWords("");
System.out.println(words);

[, , , , , , , , , , , ]

[, , , , , , , , , ]
 , 
word.refine.pathclasspath:word_refine.txt
 =
 =
refine
words = WordRefiner.refine(words);
System.out.println(words);

[, , , , , , , , , ]

13

List<Word> words = WordSegmenter.segWithStopWords("");
System.out.println(words);

[, , , , , ]

SynonymTagging.process(words);
System.out.println(words);

[, [, , , ], , , , []]

SynonymTagging.process(words, false);
System.out.println(words);

[, [, , , ], , , , [, ]]

List<Word> words = WordSegmenter.segWithStopWords("");
System.out.println(words);

[, , , , , , ]

SynonymTagging.process(words);
System.out.println(words);

[, , , [], [, , ], , [, ]]

SynonymTagging.process(words, false);
System.out.println(words);

[, , , [], [, , , , , , , , , , , , , , , , , , , , , ], , [, ]]


WordgetSynonym()
System.out.println(word.getSynonym());

[, , , ]
getSynonym()Collections.emptyList()



ABACBDCE

AA B C
BA B D
CA C E
A B CA B C D E

14

List<Word> words = WordSegmenter.segWithStopWords("5");
System.out.println(words);

[5, , , , , , ]

AntonymTagging.process(words);
System.out.println(words);

[5, [, , ], , , , , ]

List<Word> words = WordSegmenter.segWithStopWords(",,");
System.out.println(words);

[, , , , , , , , , , , , , , , , , , , , , , , , ]

AntonymTagging.process(words);
System.out.println(words);

[, , , , , , , , , , , , , , , , , , , , [, , , ], , , , [, , , , , ]]


WordgetAntonym()
System.out.println(word.getAntonym());

[, , ]
getAntonym()Collections.emptyList()

15

List<Word> words = WordSegmenter.segWithStopWords("741220");
System.out.println(words);

[, , , 7, , , , , , 4, 12, , , , , , , , 20, ]

PinyinTagging.process(words);
System.out.println(words);

[ sd sudu,  y yu,  jq jiqing, 7,  d de,  zg zhongguo,  nd neidi,  pf piaofang,  z zi, 4, 12,  sy shangying,  yl yilai,  z zai,  dd duanduan,  lz liangzhou,  n nei,  tp tupo, 20,  rmb renminbi]


WordgetFullPinYin()sudu
WordgetAcronymPinYin()sd

16Lucene

1wordChineseWordAnalyzer
Analyzer analyzer = new ChineseWordAnalyzer();

Analyzer analyzer = new ChineseWordAnalyzer(SegmentationAlgorithm.FullSegmentation);
SegmentationAlgorithm.BidirectionalMaximumMatching
SegmentationAlgorithm

2word
TokenStream tokenStream = analyzer.tokenStream("text", "APDPlat");
//
tokenStream.reset();
//
while(tokenStream.incrementToken()){
	//
	CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
	//
	OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
	//
	PositionIncrementAttribute positionIncrementAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);
	
	LOGGER.info(charTermAttribute.toString()+" ("+offsetAttribute.startOffset()+" - "+offsetAttribute.endOffset()+") "+positionIncrementAttribute.getPositionIncrement());
}
//
tokenStream.close();

3wordLucene
Directory directory = new RAMDirectory();
IndexWriterConfig config = new IndexWriterConfig(analyzer);
IndexWriter indexWriter = new IndexWriter(directory, config);

4wordLucene
QueryParser queryParser = new QueryParser("text", analyzer);
Query query = queryParser.parse("text:");
TopDocs docs = indexSearcher.search(query, Integer.MAX_VALUE);

17Solr

1word-1.3.jar
http://search.maven.org/remotecontent?filepath=org/apdplat/word/1.3/word-1.3.jar

2、创建目录solr-5.2.0/example/solr/lib，将word-1.3.jar复制到lib目录

3schema
solr-5.2.0/example/solr/collection1/conf/schema.xml
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<tokenizer class="solr.StandardTokenizerFactory"/>
<tokenizer class="org.apdplat.word.solr.ChineseWordTokenizerFactory"/>
filter

4
<tokenizer class="org.apdplat.word.solr.ChineseWordTokenizerFactory" segAlgorithm="ReverseMinimumMatching"/>
segAlgorithm	 
MaximumMatching
ReverseMaximumMatching
MinimumMatching
ReverseMinimumMatching
BidirectionalMaximumMatching
BidirectionalMinimumMatching
BidirectionalMaximumMinimumMatching
FullSegmentation
MinimalWordCount
最大Ngram分值算法：MaxNgramScore
BidirectionalMaximumMatching

5
<tokenizer class="org.apdplat.word.solr.ChineseWordTokenizerFactory" segAlgorithm="ReverseMinimumMatching"
		conf="solr-5.2.0/example/solr/nutch/conf/word.local.conf"/>
word.local.conf word-1.3.jar word.conf
 word-1.3.jar word.conf

18ElasticSearch

1elasticsearch
cd elasticsearch-5.4.3

2word
wget http://apdplat.org/word/archive/v1.4.1.zip
mkdir plugins/word
unzip -d plugins/word v1.4.1.zip
如果elasticsearch的版本不是5.4.3，例如是5.6.4，则需要修改文件plugins/word/plugin-descriptor.properties中的配置：elasticsearch.version=5.6.4
	
3ElasticSearch	
bin/elasticsearch

4Chrome
http://localhost:9200/_analyze?analyzer=word&text=APDPlat

19Luke

1http://luke.googlecode.com/files/lukeall-4.0.0-ALPHA.jar

2Javaword-1.0-bin.ziphttp://pan.baidu.com/s/1dDziDFz

3 Javaword-1.0-bin/word-1.0 4jar
winrarlukeall-4.0.0-ALPHA.jarMETA-INF.jar
.bat.htmlword.local.conflukeall-4.0.0-ALPHA.jar

4 java -jar lukeall-4.0.0-ALPHA.jar lukeSearchAnalysis
 org.apdplat.word.lucene.ChineseWordAnalyzer 

5PluginsAvailable analyzers found on the current classpath 
org.apdplat.word.lucene.ChineseWordAnalyzer 

wordmvn install
mvn dependency:copy-dependenciesjartarget/dependency/
jartarget/dependency/slf4j-api-1.6.4.jarword
target/dependency/logback-classic-0.9.28.jar
target/dependency/logback-core-0.9.28.jarword
target/classes/logback.xmltarget/word-1.3.jarwordjar
target/classes/word.conf

Lukelucene4.0.0 lukeall-4.0.0-ALPHA-with-word-1.0.jar

Lukelucene4.10.3lukeall-4.10.3-with-word-1.2.jar

20

NN






1
2






1wordword 
demo-word-vector-corpus.bat  demo-word-vector-corpus.sh
2word 
demo-word-vector-file.bat  demo-word-vector-file.sh


   
demo-word-vector-corpus.sh 



sa=cos
   1、sa=cos，余弦相似度
   2、sa=edi，编辑距离
   3、sa=euc，欧几里得距离
   4、sa=sim，简单共有词
   5、sa=jac，Jaccard相似性系数
   6、sa=man，曼哈顿距离
   7、sa=shh，SimHash + 汉明距离
   8、sa=ja，Jaro距离
   9、sa=jaw，Jaro–Winkler距离
   10、sa=sd，Sørensen–Dice系数
limit=15
exit


  

 EditDistanceTextSimilarity
----------------------------------------------------------
	1 1.0
	2 0.21
	3 0.2
	4 0.19
	5 0.17
	6 0.17
	7 0.17
	8 0.17
	9 0.16
	10 0.15
	11 0.14
	12 0.14
	13 0.14
	14 0.13
	15 0.13
----------------------------------------------------------
  
   1







1/N


 : [ 1.0,  0.78205127,  0.7692308,  0.42307693,  0.41025642,  0.3846154,  0.32051283,  0.2948718,  0.2820513,  0.26923078,  0.23076923,  0.21794872,  0.20512821,  0.20512821,  0.20512821,  0.17948718,  0.15384616,  0.15384616,  0.15384616,  0.14102565,  0.14102565,  0.12820514,  0.12820514,  0.12820514,  0.12820514,  0.115384616,  0.102564104,  0.102564104,  0.102564104,  0.102564104]	
 : [ 1.0,  0.7119143,  0.19384204,  0.17831326,  0.16385542,  0.1394913,  0.13226238,  0.12717536,  0.11700134,  0.1145917,  0.11218206,  0.10200803,  0.08299866,  0.07951807,  0.06961178,  0.06827309,  0.066398926,  0.063453816,  0.06184739,  0.059973225,  0.05863454,  0.057563588,  0.056492638,  0.055421688,  0.05381526,  0.053547524,  0.053547524,  0.05274431,  0.052208837,  0.05167336]
 : [ 1.0,  0.46666667,  0.45555556,  0.2962963,  0.2777778,  0.27407408,  0.24814814,  0.23333333,  0.22222222,  0.21851853,  0.2074074,  0.19259259,  0.19259259,  0.18518518,  0.18148148,  0.17037037,  0.16296296,  0.14074074,  0.14074074,  0.12962963,  0.12962963,  0.12222222,  0.12222222,  0.11851852,  0.11851852,  0.11481482,  0.11481482,  0.11481482,  0.11111111,  0.11111111]	 
 : [ 1.0,  0.6136364,  0.39772728,  0.3409091,  0.26136363,  0.25,  0.23863636,  0.22727273,  0.1590909,  0.1590909,  0.14772727,  0.13636364,  0.13636364,  0.13636364,  0.125,  0.125,  0.11363637,  0.11363637,  0.10227273,  0.09090909,  0.09090909,  0.09090909,  0.09090909,  0.07954545,  0.07954545,  0.07954545,  0.06818182,  0.06818182,  0.06818182,  0.06818182]
 : [ 1.0,  0.4117647,  0.1875,  0.17830883,  0.17463236,  0.17095588,  0.15441176,  0.15441176,  0.14338236,  0.1360294,  0.13419117,  0.121323526,  0.1194853,  0.11764706,  0.0992647,  0.09375,  0.09191176,  0.090073526,  0.08455882,  0.080882356,  0.07904412,  0.075367644,  0.073529415,  0.07169118,  0.07169118,  0.0680147,  0.060661763,  0.060661763,  0.05882353,  0.05882353]
 : [ 1.0,  0.75,  0.7058824,  0.5882353,  0.5882353,  0.5588235,  0.5294118,  0.50735295,  0.5,  0.5,  0.5,  0.49264705,  0.4852941,  0.4632353,  0.44117647,  0.42647058,  0.4117647,  0.4117647,  0.3602941,  0.3602941,  0.34558824,  0.3382353,  0.31617647,  0.31617647,  0.30882353,  0.29411766,  0.2867647,  0.2867647,  0.2720588,  0.25]
 : [ 1.0,  0.41584158,  0.36138615,  0.25742576,  0.23762377,  0.20792079,  0.18811882,  0.18316832,  0.17821783,  0.17326732,  0.15841584,  0.15346535,  0.14356436,  0.12871288,  0.12376238,  0.12376238,  0.11386139,  0.10891089,  0.0990099,  0.08415841,  0.07920792,  0.07920792,  0.074257426,  0.06930693,  0.06930693,  0.06930693,  0.06930693,  0.06930693,  0.06930693,  0.06435644]
 : [ 1.0,  0.8,  0.62222224,  0.54444444,  0.36666667,  0.31111112,  0.26666668,  0.18888889,  0.17777778,  0.15555556,  0.14444445,  0.11111111,  0.11111111,  0.11111111,  0.11111111,  0.11111111,  0.11111111,  0.08888889,  0.08888889,  0.08888889,  0.08888889,  0.08888889,  0.08888889,  0.07777778,  0.07777778,  0.07777778,  0.07777778,  0.07777778,  0.06666667,  0.06666667]
 : [ 1.0,  0.73333335,  0.46666667,  0.43333334,  0.4,  0.4,  0.4,  0.33333334,  0.26666668,  0.26666668,  0.26666668,  0.26666668,  0.26666668,  0.23333333,  0.2,  0.2,  0.2,  0.2,  0.16666667,  0.16666667,  0.16666667,  0.16666667,  0.16666667,  0.13333334,  0.13333334,  0.13333334,  0.13333334,  0.13333334,  0.13333334,  0.13333334]
 : [ 1.0,  0.8,  0.4,  0.4,  0.3,  0.2,  0.2,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1,  0.1]
 : [ 1.0,  0.5123967,  0.3181818,  0.27272728,  0.23553719,  0.20661157,  0.18595041,  0.15289256,  0.14876033,  0.14049587,  0.14049587,  0.1322314,  0.12809917,  0.12809917,  0.12809917,  0.12396694,  0.11570248,  0.11570248,  0.11157025,  0.11157025,  0.10743801,  0.10330579,  0.10330579,  0.09917355,  0.09090909,  0.08677686,  0.08677686,  0.08264463,  0.08264463,  0.08264463]
 : [ 1.0,  0.8181818,  0.53333336,  0.4848485,  0.4121212,  0.38787878,  0.36969697,  0.36363637,  0.34545454,  0.3030303,  0.2969697,  0.28484848,  0.27272728,  0.27272728,  0.26666668,  0.24242425,  0.24242425,  0.24242425,  0.24242425,  0.23636363,  0.21818182,  0.21818182,  0.21818182,  0.21212122,  0.2060606,  0.2060606,  0.2060606,  0.19393939,  0.18181819,  0.16969697]
 : [ 1.0,  0.52380955,  0.5,  0.33333334,  0.33333334,  0.23809524,  0.23809524,  0.1904762,  0.1904762,  0.1904762,  0.16666667,  0.14285715,  0.14285715,  0.14285715,  0.14285715,  0.14285715,  0.14285715,  0.14285715,  0.11904762,  0.0952381,  0.0952381,  0.0952381,  0.0952381,  0.0952381,  0.0952381,  0.0952381,  0.0952381,  0.0952381,  0.0952381,  0.071428575]
 : [ 1.0,  0.91935486,  0.7580645,  0.61290324,  0.58064514,  0.5645161,  0.5483871,  0.48387095,  0.4032258,  0.38709676,  0.37096775,  0.3548387,  0.32258064,  0.30645162,  0.2580645,  0.2580645,  0.22580644,  0.22580644,  0.22580644,  0.20967741,  0.19354838,  0.19354838,  0.19354838,  0.17741935,  0.17741935,  0.17741935,  0.17741935,  0.17741935,  0.17741935,  0.16129032]
 : [ 1.0,  0.8235294,  0.3529412,  0.3529412,  0.29411766,  0.29411766,  0.23529412,  0.23529412,  0.1764706,  0.1764706,  0.1764706,  0.1764706,  0.14705883,  0.14705883,  0.11764706,  0.11764706,  0.11764706,  0.11764706,  0.11764706,  0.0882353,  0.0882353,  0.0882353,  0.0882353,  0.0882353,  0.0882353,  0.05882353,  0.05882353,  0.05882353,  0.05882353,  0.05882353]	

7  

----------------------------------------------------------
 CosineTextSimilarity
	1 1.0
	2 0.5
	3 0.47
	4 0.46
	5 0.46
	6 0.46
	7 0.46
	8 0.42
	9 0.42
	10 0.42
	11 0.39
	12 0.38
	13 0.38
	14 0.37
	15 0.37
25,572
----------------------------------------------------------
 EditDistanceTextSimilarity
	1 1.0
	2 0.21
	3 0.2
	4 0.19
	5 0.17
	6 0.17
	7 0.17
	8 0.17
	9 0.16
	10 0.15
	11 0.14
	12 0.14
	13 0.14
	14 0.13
	15 0.13
44,253
----------------------------------------------------------
 EuclideanDistanceTextSimilarity
	1 1.0
	2 0.37
	3 0.37
	4 0.37
	5 0.37
	6 0.37
	7 0.37
	8 0.36
	9 0.36
	10 0.36
	11 0.36
	12 0.36
	13 0.36
	14 0.36
	15 0.36
24,710
----------------------------------------------------------
 SimpleTextSimilarity
	1 1.0
	2 0.36
	3 0.33
	4 0.33
	5 0.33
	6 0.32
	7 0.32
	8 0.3
	9 0.3
	10 0.3
	11 0.29
	12 0.29
	13 0.29
	14 0.28
	15 0.28
21,918
----------------------------------------------------------
 JaccardTextSimilarity
	1 1.0
	2 0.22
	3 0.2
	4 0.18
	5 0.18
	6 0.18
	7 0.18
	8 0.15
	9 0.15
	10 0.15
	11 0.15
	12 0.15
	13 0.15
	14 0.15
	15 0.13
19,717
----------------------------------------------------------
 ManhattanDistanceTextSimilarity
	1 1.0
	2 0.11
	3 0.11
	4 0.11
	5 0.11
	6 0.11
	7 0.11
	8 0.11
	9 0.11
	10 0.11
	11 0.11
	12 0.11
	13 0.1
	14 0.1
	15 0.1
23,857
----------------------------------------------------------
 SimHashPlusHammingDistanceTextSimilarity
	1 1.0
	2 0.96
	3 0.95
	4 0.95
	5 0.95
	6 0.95
	7 0.95
	8 0.95
	9 0.94
	10 0.94
	11 0.94
	12 0.94
	13 0.94
	14 0.94
	15 0.94
5,57,339
----------------------------------------------------------
 JaroDistanceTextSimilarity
	1 1.0
	2 0.49
	3 0.49
	4 0.48
	5 0.47
	6 0.46
	7 0.46
	8 0.45
	9 0.45
	10 0.45
	11 0.45
	12 0.45
	13 0.44
	14 0.44
	15 0.44
12,718
----------------------------------------------------------
 JaroWinklerDistanceTextSimilarity
	1 1.0
	2 0.56
	3 0.55
	4 0.55
	5 0.54
	6 0.53
	7 0.53
	8 0.53
	9 0.52
	10 0.52
	11 0.52
	12 0.51
	13 0.51
	14 0.51
	15 0.51
16,723
----------------------------------------------------------
 SrensenDiceCoefficientTextSimilarity
	1 1.0
	2 0.37
	3 0.33
	4 0.3
	5 0.3
	6 0.3
	7 0.3
	8 0.27
	9 0.27
	10 0.27
	11 0.27
	12 0.27
	13 0.27
	14 0.27
	15 0.23
19,852
----------------------------------------------------------

21

org.apdplat.word.WordFrequencyStatistics

text.txt
chmod +x wfs.sh && ./wfs.sh -textFile=text.txt -statisticsResultFile=statistics-result.txt
statistics-result.txt



//
WordFrequencyStatistics wordFrequencyStatistics = new WordFrequencyStatistics();
wordFrequencyStatistics.setRemoveStopWord(false);
wordFrequencyStatistics.setResultPath("word-frequency-statistics.txt");
wordFrequencyStatistics.setSegmentationAlgorithm(SegmentationAlgorithm.MaxNgramScore);
//
wordFrequencyStatistics.seg("");
//
wordFrequencyStatistics.dump();
//
Files.write(Paths.get("text-to-seg.txt"), Arrays.asList("wordJavangram"));
//
wordFrequencyStatistics.reset();
//
wordFrequencyStatistics.seg(new File("text-to-seg.txt"), new File("text-seg-result.txt"));
//
wordFrequencyStatistics.dump("file-seg-statistics-result.txt");



1 2
2 2
3 2
4 1
5 1
6 1
7 1
8 1
9 1
10 1
11 1
12 1
13 1
14 1
15 1



1 2
2 2
3 1
4word 1
5 1
6 1
7ngram 1
8 1
9 1
10 1
11 1
12 1
13 1
14 1
15 1
16 1
17 1
18 1
19 1
20 1
21 1
22Java 1	

22

word

org.apdplat.word.analysis.CosineTextSimilarity

String text1 = "";
String text2 = "";
String text3 = "";
TextSimilarity textSimilarity = new CosineTextSimilarity();
double score1pk1 = textSimilarity.similarScore(text1, text1);
double score1pk2 = textSimilarity.similarScore(text1, text2);
double score1pk3 = textSimilarity.similarScore(text1, text3);
double score2pk2 = textSimilarity.similarScore(text2, text2);
double score2pk3 = textSimilarity.similarScore(text2, text3);
double score3pk3 = textSimilarity.similarScore(text3, text3);
System.out.println(text1+"  "+text1+" "+score1pk1);
System.out.println(text1+"  "+text2+" "+score1pk2);
System.out.println(text1+"  "+text3+" "+score1pk3);
System.out.println(text2+"  "+text2+" "+score2pk2);
System.out.println(text2+"  "+text3+" "+score2pk3);
System.out.println(text3+"  "+text3+" "+score3pk3);



   1.0
   0.67
   0.0
   1.0
   0.0
   1.0

org.apdplat.word.analysis.SimpleTextSimilarity

String text1 = "";
String text2 = "";
String text3 = "";
TextSimilarity textSimilarity = new SimpleTextSimilarity();
double score1pk1 = textSimilarity.similarScore(text1, text1);
double score1pk2 = textSimilarity.similarScore(text1, text2);
double score1pk3 = textSimilarity.similarScore(text1, text3);
double score2pk2 = textSimilarity.similarScore(text2, text2);
double score2pk3 = textSimilarity.similarScore(text2, text3);
double score3pk3 = textSimilarity.similarScore(text3, text3);
System.out.println(text1+"  "+text1+" "+score1pk1);
System.out.println(text1+"  "+text2+" "+score1pk2);
System.out.println(text1+"  "+text3+" "+score1pk3);
System.out.println(text2+"  "+text2+" "+score2pk2);
System.out.println(text2+"  "+text3+" "+score2pk3);
System.out.println(text3+"  "+text3+" "+score3pk3);



   1.0
   0.5
   0.0
   1.0
   0.0
   1.0

org.apdplat.word.analysis.EditDistanceTextSimilarity

String text1 = "";
String text2 = "";
String text3 = "";
TextSimilarity textSimilarity = new EditDistanceTextSimilarity();
double score1pk1 = textSimilarity.similarScore(text1, text1);
double score1pk2 = textSimilarity.similarScore(text1, text2);
double score1pk3 = textSimilarity.similarScore(text1, text3);
double score2pk2 = textSimilarity.similarScore(text2, text2);
double score2pk3 = textSimilarity.similarScore(text2, text3);
double score3pk3 = textSimilarity.similarScore(text3, text3);
System.out.println(text1+"  "+text1+" "+score1pk1);
System.out.println(text1+"  "+text2+" "+score1pk2);
System.out.println(text1+"  "+text3+" "+score1pk3);
System.out.println(text2+"  "+text2+" "+score2pk2);
System.out.println(text2+"  "+text3+" "+score2pk3);
System.out.println(text3+"  "+text3+" "+score3pk3);



   1.0
   0.5
   0.0
   1.0
   0.0
   1.0

SimHash + SimHash

org.apdplat.word.analysis.SimHashPlusHammingDistanceTextSimilarity

String text1 = "";
String text2 = "";
String text3 = "";
TextSimilarity textSimilarity = new SimHashPlusHammingDistanceTextSimilarity();
double score1pk1 = textSimilarity.similarScore(text1, text1);
double score1pk2 = textSimilarity.similarScore(text1, text2);
double score1pk3 = textSimilarity.similarScore(text1, text3);
double score2pk2 = textSimilarity.similarScore(text2, text2);
double score2pk3 = textSimilarity.similarScore(text2, text3);
double score3pk3 = textSimilarity.similarScore(text3, text3);
System.out.println(text1+"  "+text1+" "+score1pk1);
System.out.println(text1+"  "+text2+" "+score1pk2);
System.out.println(text1+"  "+text3+" "+score1pk3);
System.out.println(text2+"  "+text2+" "+score2pk2);
System.out.println(text2+"  "+text3+" "+score2pk3);
System.out.println(text3+"  "+text3+" "+score3pk3);



   1.0
   0.95
   0.83
   1.0
   0.86
   1.0

Jaccard相似性系数（Jaccard similarity coefficient），通过计算两个集合交集的大小除以并集的大小来评估他们的相似度

org.apdplat.word.analysis.JaccardTextSimilarity

String text1 = "";
String text2 = "";
String text3 = "";
TextSimilarity textSimilarity = new JaccardTextSimilarity();
double score1pk1 = textSimilarity.similarScore(text1, text1);
double score1pk2 = textSimilarity.similarScore(text1, text2);
double score1pk3 = textSimilarity.similarScore(text1, text3);
double score2pk2 = textSimilarity.similarScore(text2, text2);
double score2pk3 = textSimilarity.similarScore(text2, text3);
double score3pk3 = textSimilarity.similarScore(text3, text3);
System.out.println(text1+"  "+text1+" "+score1pk1);
System.out.println(text1+"  "+text2+" "+score1pk2);
System.out.println(text1+"  "+text3+" "+score1pk3);
System.out.println(text2+"  "+text2+" "+score2pk2);
System.out.println(text2+"  "+text3+" "+score2pk3);
System.out.println(text3+"  "+text3+" "+score3pk3);



   1.0
   0.5
   0.0
   1.0
   0.0
   1.0

Euclidean Distance

org.apdplat.word.analysis.EuclideanDistanceTextSimilarity

String text1 = "";
String text2 = "";
String text3 = "";
TextSimilarity textSimilarity = new EuclideanDistanceTextSimilarity();
double score1pk1 = textSimilarity.similarScore(text1, text1);
double score1pk2 = textSimilarity.similarScore(text1, text2);
double score1pk3 = textSimilarity.similarScore(text1, text3);
double score2pk2 = textSimilarity.similarScore(text2, text2);
double score2pk3 = textSimilarity.similarScore(text2, text3);
double score3pk3 = textSimilarity.similarScore(text3, text3);
System.out.println(text1+"  "+text1+" "+score1pk1);
System.out.println(text1+"  "+text2+" "+score1pk2);
System.out.println(text1+"  "+text3+" "+score1pk3);
System.out.println(text2+"  "+text2+" "+score2pk2);
System.out.println(text2+"  "+text3+" "+score2pk3);
System.out.println(text3+"  "+text3+" "+score3pk3);



   1.0
   0.41
   0.29
   1.0
   0.29
   1.0

Manhattan Distance

org.apdplat.word.analysis.ManhattanDistanceTextSimilarity

String text1 = "";
String text2 = "";
String text3 = "";
TextSimilarity textSimilarity = new ManhattanDistanceTextSimilarity();
double score1pk1 = textSimilarity.similarScore(text1, text1);
double score1pk2 = textSimilarity.similarScore(text1, text2);
double score1pk3 = textSimilarity.similarScore(text1, text3);
double score2pk2 = textSimilarity.similarScore(text2, text2);
double score2pk3 = textSimilarity.similarScore(text2, text3);
double score3pk3 = textSimilarity.similarScore(text3, text3);
System.out.println(text1+"  "+text1+" "+score1pk1);
System.out.println(text1+"  "+text2+" "+score1pk2);
System.out.println(text1+"  "+text3+" "+score1pk3);
System.out.println(text2+"  "+text2+" "+score2pk2);
System.out.println(text2+"  "+text3+" "+score2pk3);
System.out.println(text3+"  "+text3+" "+score3pk3);



   1.0
   0.33
   0.14
   1.0
   0.14
   1.0

Jaro距离（Jaro Distance），编辑距离的一个类型

org.apdplat.word.analysis.JaroDistanceTextSimilarity

String text1 = "";
String text2 = "";
String text3 = "";
TextSimilarity textSimilarity = new JaroDistanceTextSimilarity();
double score1pk1 = textSimilarity.similarScore(text1, text1);
double score1pk2 = textSimilarity.similarScore(text1, text2);
double score1pk3 = textSimilarity.similarScore(text1, text3);
double score2pk2 = textSimilarity.similarScore(text2, text2);
double score2pk3 = textSimilarity.similarScore(text2, text3);
double score3pk3 = textSimilarity.similarScore(text3, text3);
System.out.println(text1+"  "+text1+" "+score1pk1);
System.out.println(text1+"  "+text2+" "+score1pk2);
System.out.println(text1+"  "+text3+" "+score1pk3);
System.out.println(text2+"  "+text2+" "+score2pk2);
System.out.println(text2+"  "+text3+" "+score2pk3);
System.out.println(text3+"  "+text3+" "+score3pk3);



   1.0
   0.67
   0.0
   1.0
   0.0
   1.0

Jaro–Winkler距离（Jaro–Winkler Distance），Jaro的扩展

org.apdplat.word.analysis.JaroWinklerDistanceTextSimilarity

String text1 = "";
String text2 = "";
String text3 = "";
TextSimilarity textSimilarity = new JaroWinklerDistanceTextSimilarity();
double score1pk1 = textSimilarity.similarScore(text1, text1);
double score1pk2 = textSimilarity.similarScore(text1, text2);
double score1pk3 = textSimilarity.similarScore(text1, text3);
double score2pk2 = textSimilarity.similarScore(text2, text2);
double score2pk3 = textSimilarity.similarScore(text2, text3);
double score3pk3 = textSimilarity.similarScore(text3, text3);
System.out.println(text1+"  "+text1+" "+score1pk1);
System.out.println(text1+"  "+text2+" "+score1pk2);
System.out.println(text1+"  "+text3+" "+score1pk3);
System.out.println(text2+"  "+text2+" "+score2pk2);
System.out.println(text2+"  "+text3+" "+score2pk3);
System.out.println(text3+"  "+text3+" "+score3pk3);



   1.0
   0.73
   0.0
   1.0
   0.0
   1.0

Sørensen–Dice系数（Sørensen–Dice coefficient），通过计算两个集合交集的大小的2倍除以两个集合的大小之和来评估他们的相似度

org.apdplat.word.analysis.SrensenDiceCoefficientTextSimilarity

String text1 = "";
String text2 = "";
String text3 = "";
TextSimilarity textSimilarity = new SrensenDiceCoefficientTextSimilarity();
double score1pk1 = textSimilarity.similarScore(text1, text1);
double score1pk2 = textSimilarity.similarScore(text1, text2);
double score1pk3 = textSimilarity.similarScore(text1, text3);
double score2pk2 = textSimilarity.similarScore(text2, text2);
double score2pk3 = textSimilarity.similarScore(text2, text3);
double score3pk3 = textSimilarity.similarScore(text3, text3);
System.out.println(text1+"  "+text1+" "+score1pk1);
System.out.println(text1+"  "+text2+" "+score1pk2);
System.out.println(text1+"  "+text3+" "+score1pk3);
System.out.println(text2+"  "+text2+" "+score2pk2);
System.out.println(text2+"  "+text3+" "+score2pk3);
System.out.println(text3+"  "+text3+" "+score3pk3);



   1.0
   0.67
   0.0
   1.0
   0.0
   1.0

23

:
unix-like:
	chmod +x sentence-identify.sh && ./sentence-identify.sh
windows:
	.\sentence-identify.bat
 org.apdplat.word.analysis.SentenceIdentify :

1. : , : 0.71428573
2. : , : 0.6666667
3. : , : 0.5
4. : , : 0.5
5. : , : 0.2857143
6. : , : 0.2857143
7. : , : 0.25
8. : , : 0.22222222
9. : , : 0.2
10. : , : 0.2



:
:
: [, , , , , ]
: 
: 1.0

:
:
: [, , , , , , , , , , , , ]
: 
: 0.8333333

1word Ngram
370.9714 /
66.55%  33.44%  2533709  1686210  847499
60.94% 39.05% 28374490 17293964 11080526

2word 
330.1586 /
65.67%  34.32%  2533709  1663958  869751
60.12% 39.87% 28374490 17059641 11314849

3word 
62.960262 /
57.2%  42.79%  2533709  1449288  1084421
47.95% 52.04% 28374490 13605742 14768748

4word 
462.87158 /
53.06%  46.93%  2533709  1344624  1189085
43.07% 56.92% 28374490 12221610 16152880

5word 
967.68604 /
46.34%  53.65%  2533709  1174276  1359433
36.07% 63.92% 28374490 10236574 18137916

6word 
661.148 /
46.18%  53.81%  2533709  1170075  1363634
35.65% 64.34% 28374490 10117122 18257368

7word 
1567.1318 /
41.88%  58.11%  2533709  1061189  1472520
31.35% 68.64% 28374490 8896173 19478317

8word 
1232.6017 /
41.69%  58.3%  2533709  1056515  1477194
30.98% 69.01% 28374490 8792532 19581958

9word 
1936.9575 /
41.42%  58.57%  2533709  1049673  1484036
31.34% 68.65% 28374490 8893622 19480868

10word 
2228.9465 /
36.7%  63.29%  2533709  930069  1603640
26.72% 73.27% 28374490 7583741 20790749

1

2

3

4

5

6ngram

7

8

99Java

1011946

119271

12

13word

14word

15word

Javacws_evaluation

JavaQuestionAnsweringSystem

Javaword_webwebword

An Implementation of Double-Array Trie

MMSEG: A Word Identification System for Mandarin Chinese Text Based on Two Variants of the Maximum Matching Algorithm

With Google's new tool Ngram Viewer, you can visualise the rise and fall of concepts across 5 million books and 500 years!

word2vec

https://travis-ci.org/ysc/word