see my blog post for more info
preprocess
stanford parser
cd /mnt
wget http://nlp.stanford.edu/downloads/stanford-parser-2011-09-14.tgz
tar zxf stanford-parser-2011-09-14.tgz
freebase dump
wget http://download.freebase.com/wex/latest/freebase-wex-2011-09-30-articles.tsv.bz2 # 7/8gb
take just the article text from the dump, stripping newlines and the < > characters
bzcat freebase-wex-2011-09-30-articles.tsv.bz2 | cut -f5 | perl -plne's/\\n\\n/ /g' | sed -es/[\<\>]/\ /g > articles
extract sentences
java -classpath /mnt/stanford-parser-2011-09-14/stanford-parser.jar edu.stanford.nlp.process.DocumentPreprocessor articles > sentences
move to hdfs
hadoop fs -mkdir sentences
hadoop fs -copyFromLocal sentences sentences/sentences
filter out long "terms", urls and terms without at least one alphanumeric character
-- clean_sentences.pig
s = load 'sentences' as (s:chararray);
define chw `python cut_huge_words.py` ship('cut_huge_words.py');
s = stream s through chw;
define fsnc `python filter_single_non_char.py` ship('filter_single_non_char.py');
s = stream s through fsnc;
store s into 'sentences.filtered';
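( cut_huge_words.py and filter_single_non_char.py aren't listed here; a minimal sketch of what a streaming filter along those lines might look like, reading sentences on stdin and writing filtered sentences on stdout. the 50 char cutoff and the url regex are assumptions, not the originals )

#!/usr/bin/env python
# rough sketch of a pig streaming filter in the spirit of cut_huge_words.py /
# filter_single_non_char.py; MAX_LEN and the url check are assumed values
import re
import sys

MAX_LEN = 50  # assumed cutoff for a "huge" word

def keep(token):
    if len(token) > MAX_LEN:                 # drop very long "terms"
        return False
    if re.match(r'https?://', token):        # drop urls
        return False
    return re.search(r'[a-zA-Z0-9]', token)  # keep only tokens with an alphanumeric char

for line in sys.stdin:
    tokens = [t for t in line.split() if keep(t)]
    if tokens:
        sys.stdout.write(' '.join(tokens) + '\n')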
note! the same sentences seem to be repeated 2-3 times (???) so i wrote a pig job to get rid of them...
-- distinct_sentences.pig
s = load 'sentences';
d = distinct s;
store d into 'sentences.distinct';
extract ngrams
extract
-- extract_ngrams.pig
define ngrams(s, n, out) returns void {
  define ngrams `python ngrams.py $n` ship('ngrams.py');
  ngrams = stream $s through ngrams;
  store ngrams into '$out';
}
sentences = load 'sentences' as (sentence:chararray);
ngrams(sentences, 1, 'unigrams');
ngrams(sentences, 2, 'bigrams');
ngrams(sentences, 3, 'trigrams');
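( ngrams.py itself isn't listed; a sketch of what it might look like, assuming it takes n as its only argument and writes one tab separated n-gram per output line, which is the layout the later loads expect )

#!/usr/bin/env python
# sketch of an ngrams.py style streaming script: for each input sentence emit
# every n-gram, one per line, tokens tab separated; n comes from argv
import sys

n = int(sys.argv[1])
for line in sys.stdin:
    tokens = line.split()
    for i in range(len(tokens) - n + 1):
        sys.stdout.write('\t'.join(tokens[i:i + n]) + '\n')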
sanity check frequency of unigram lengths..
-- unigram_length_freq.pig
set default_parallel 24;
register 'pig/piggybank.jar';
define len org.apache.pig.piggybank.evaluation.string.LENGTH;
unigrams = load 'unigrams' as (t1:chararray);
lengths = foreach unigrams generate len(t1) as length;
grped = group lengths by length;
freqs = foreach grped generate group as length, COUNT(lengths) as freq;
store freqs into 'length_freqs';

$ hfs -cat /user/hadoop/length_freqs/* | sort -n
1 180552974
2 218916294
3 229329881
4 171347574
5 154830964
6 111346344
7 104234492
8 79782236
9 54408813
10 36727785
11 19836470
12 12233497
13 6588796
14 3133951
15 1394993
...
93 14
94 18
95 39
96 25
97 14
98 13
99 15
100 25
n-gram quantiles
get datafu
wget https://github.com/downloads/linkedin/datafu/datafu-0.0.1.tar.gz
tar zxf datafu-0.0.1.tar.gz
cd datafu-0.0.1
ant
extract quantiles
( if i was to redo this i'd use StreamingQuantile to avoid the sort )
-- quantiles.pig
set default_parallel 36;
register 'datafu-0.0.1/dist/datafu-0.0.1.jar';
define percentiles datafu.pig.stats.Quantile('0.9','0.91','0.92','0.93','0.94','0.95','0.96','0.97','0.98','0.99','0.991','0.992','0.993','0.994','0.995','0.996','0.997','0.998','0.999','1.0');
define quantiles(A, key, out) returns void {
  grped_f = group $A by $key;
  freqs = foreach grped_f generate COUNT($A) as freq;
  grped = group freqs all;
  quantiles = foreach grped {
    sorted = order freqs by freq;
    generate flatten(percentiles(sorted));
  }
  store quantiles into '$out';
}
unigrams = load 'unigrams' as (t1:chararray);
quantiles(unigrams, t1, unigrams_quantiles);
bigrams = load 'bigrams' as (t1:chararray, t2:chararray);
quantiles(bigrams, '(t1,t2)', bigrams_quantiles);
result
$ hfs -cat unigrams_quantiles/part-r-00000
12.0 14.0 16.0 20.0 25.0 32.0 45.0 69.0 129.0 392.0 465.0 562.0 693.0 882.0 1177.0 1679.0 2574.0 4703.0 12582.0 7.4528781E7

hadoop@ip-10-17-216-123:~$ hfs -cat bigrams_quantiles/part-r-00000
6.0 7.0 8.0 10.0 11.0 14.0 19.0 27.0 43.0 99.0 112.0 129.0 151.0 180.0 222.0 286.0 396.0 621.0 1303.0 1.2184383E7
extract ngram frequencies and counts
-- ngram_freq_and_count.pig
define calc_frequencies_and_count(A, key, F, C) returns void {
  grped = group $A by $key;
  ngram_f = foreach grped generate flatten(group), COUNT($A) as freq;
  store ngram_f into '$F';
  grped = group ngram_f all;
  ngram_c = foreach grped generate SUM(ngram_f.freq) as count;
  store ngram_c into '$C';
}
unigrams = load 'unigrams' as (t1:chararray);
calc_frequencies_and_count(unigrams, t1, unigram_f, unigram_c);
bigrams = load 'bigrams' as (t1:chararray, t2:chararray);
calc_frequencies_and_count(bigrams, '(t1,t2)', bigram_f, bigram_c);
trigrams = load 'trigrams' as (t1:chararray, t2:chararray, t3:chararray);
calc_frequencies_and_count(trigrams, '(t1,t2,t3)', trigram_f, trigram_c);
high frequency ngrams
-- top_ngrams_by_freq.pig
n = load 'unigram_f' as (t1:chararray, freq:long);
s = order n by freq desc;
s = limit s 10;
store s into 'unigram_top10';

n = load 'bigram_f' as (t1:chararray, t2:chararray, freq:long);
s = order n by freq desc;
s = limit s 10;
store s into 'bigram_top10';

n = load 'trigram_f' as (t1:chararray, t2:chararray, t3:chararray, freq:long);
s = order n by freq desc;
s = limit s 10;
store s into 'trigram_top10';
result
$ hfs -cat /user/hadoop/unigram_top10/*
the 74528781
, 70605655
. 54902186
of 41340440
and 34962970
in 30111358
to 24904598
a 24600822
was 14322281
is 13485661

$ hfs -cat /user/hadoop/bigram_top10/*
of the 12184383
in the 8042527
, and 7223201
, the 4756776
to the 4077474
is a 2913727
and the 2740848
on the 2657395
| | 2615179
for the 2395016

$ hfs -cat /user/hadoop/trigram_top10/*
| | | 805094
, and the 767814
one of the 617374
-RRB- is a 562709
| - | 516652
as well as 504222
the United States 504124
part of the 413992
| align = 352575
-RRB- , and 290050
bigram mutual info
-- bigram_mutual_info.pig
-- load up frequencies and counts
unigram_f = load 'unigram_f' as (t1:chararray, freq:long);
unigram_c = load 'unigram_c' as (count:long);
bigram_f = load 'bigram_f' as (t1:chararray, t2:chararray, freq:long);
bigram_c = load 'bigram_c' as (count:long);

-- only process bigrams with a support > 5000
bigram_f = filter bigram_f by freq>5000;

-- bigram mutual info
-- log2( p(a,b) / ( p(a) * p(b) ) )
bigram_joined_1 = join bigram_f by t1, unigram_f by t1;
bigram_joined_2 = join bigram_joined_1 by t2, unigram_f by t1;
mutual_info = foreach bigram_joined_2 {
  t1 = bigram_joined_1::bigram_f::t1;
  t2 = bigram_joined_1::bigram_f::t2;
  t1_t2_f = bigram_joined_1::bigram_f::freq;
  t1_f = bigram_joined_1::unigram_f::freq;
  t2_f = unigram_f::freq;
  pxy = (double)t1_t2_f / bigram_c.count;
  px = (double)t1_f / unigram_c.count;
  py = (double)t2_f / unigram_c.count;
  mutual_info = LOG(pxy / (px * py)) / LOG(2);
  generate t1, t2, t1_t2_f, mutual_info as mi;
}

-- sort and store
positive_mutual_info = filter mutual_info by mi>0;
sorted = order positive_mutual_info by mi desc;
store sorted into 'bigram_mutual_info';

-- dump top 10k, support 5,000
sorted = order positive_mutual_info by mi desc;
top_10k_s5k = limit sorted 10000;
store top_10k_s5k into 'bigram_mutual_info__top10k_s5k.gz';

-- dump top 10k, support 50,000
sup = filter positive_mutual_info by t1_t2_f > 50000;
sorted = order sup by mi desc;
top_10k = limit sorted 10000;
store top_10k into 'bigram_mutual_info__top10k_s50k.gz';

-- dump top 10k, support 250,000
sup = filter positive_mutual_info by t1_t2_f > 250000;
sorted = order sup by mi desc;
top_10k = limit sorted 10000;
store top_10k into 'bigram_mutual_info__top10k_s250k.gz';
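( the same per-bigram calculation as a tiny python helper, handy for spot checking a single row locally; the arguments are the values you'd pull out of bigram_f, unigram_f, bigram_c and unigram_c )

from math import log

def bigram_mi(t1_t2_f, t1_f, t2_f, bigram_c, unigram_c):
    # log2( p(a,b) / ( p(a) * p(b) ) ), mirroring the pig script above
    pxy = float(t1_t2_f) / bigram_c
    px = float(t1_f) / unigram_c
    py = float(t2_f) / unigram_c
    return log(pxy / (px * py), 2)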
result
$ hfs -cat /user/hadoop/bigram_mutual_info__top10k_s5k.gz/part-r-00000.gz | gunzip | head
mutual_info t1 t2 t12_f bigram_c t1_f t2_f unigram_c
12.397738367262338 Burkina Faso 5417 1331695519 5687 5679 1386868488
12.136124155313361 Rotten Tomatoes 5695 1331695519 7264 6072 1386868488
12.113289381980444 Kuala Lumpur 6441 1331695519 7847 6504 1386868488
11.72023878663395 Tel Aviv 9106 1331695519 11212 9534 1386868488
11.679730019305111 Baton Rouge 5587 1331695519 6219 10982 1386868488
11.396162983156035 Figure Skating 5518 1331695519 11164 8023 1386868488
11.391231883436134 Lok Sabha 7429 1331695519 8908 13604 1386868488
11.169792546284583 Notre Dame 13516 1331695519 13845 19872 1386868488
11.127412869936919 Buenos Aires 20595 1331695519 20908 20919 1386868488
11.038927639150135 gastropod mollusk 19335 1331695519 22195 20212 1386868488

hadoop@ip-10-17-216-123:~$ hfs -cat /user/hadoop/bigram_mutual_info__top10k_s50k.gz/part-r-00000.gz | gunzip | head
9.752548540247900 Hong Kong 67506 1331695519 74127 76481 1386868488
9.100741649871587 Los Angeles 134207 1331695519 158036 136862 1386868488
9.04193275319404 Summer Olympics 50573 1331695519 92912 93036 1386868488
8.867283927960393 Prime Minister 64629 1331695519 79612 165235 1386868488
8.834295951388988 median income 106184 1331695519 116870 191133 1386868488
8.697542938085967 Supreme Court 56347 1331695519 72797 186693 1386868488
8.58353850698748 \ t 110564 1331695519 232890 128335 1386868488
8.534865092105356 square mile 57875 1331695519 175459 93613 1386868488
8.349655012939401 San Francisco 83370 1331695519 277709 102536 1386868488
8.314182119206784 Air Force 92015 1331695519 218205 149230 1386868488

hadoop@ip-10-17-216-123:~$ hfs -cat /user/hadoop/bigram_mutual_info__top10k_s250k.gz/part-r-00000.gz | gunzip | head
7.226096947140881 United States 719664 1331695519 1005315 752037 1386868488
7.125046765096364 align = 401440 1331695519 404672 1152961 1386868488
7.004555445216198 New York 512842 1331695519 1209924 555714 1386868488
5.8550085688353946 did not 310806 1331695519 545985 2356006 1386868488
5.708106314799068 more than 273197 1331695519 1346085 972904 1386868488
5.19926746463544 | align 352579 1331695519 6947135 404672 1386868488
5.176074269794974 can be 492695 1331695519 1235963 3253103 1386868488
5.158784004550341 have been 497399 1331695519 2157275 1914404 1386868488
5.051402270277741 has been 650292 1331695519 3140106 1914404 1386868488
4.813860313764799 = `` 638122 1331695519 1152961 6488163 1386868488
trigram mutual info
-- trigram_mutual_info.pig
-- load up frequencies and counts
unigram_f = load 'unigram_f' as (t1:chararray, freq:long);
unigram_c = load 'unigram_c' as (count:long);
bigram_f = load 'bigram_f' as (t1:chararray, t2:chararray, freq:long);
bigram_c = load 'bigram_c' as (count:long);
trigram_f = load 'trigram_f' as (t1:chararray, t2:chararray, t3:chararray, freq:long);
trigram_c = load 'trigram_c' as (count:long);

-- only consider trigrams with a support > 100
trigram_f = filter trigram_f by freq > 100;

-- join; like a boss (a boss who doesn't know how to write idiomatic pig joins)
j1 = join trigram_f by t1, unigram_f by t1;
j1 = foreach j1 generate trigram_f::t1 as t1, t2 as t2, t3 as t3, trigram_f::freq as tri_f, unigram_f::freq as t1_f;
j2 = join j1 by t2, unigram_f by t1;
j2 = foreach j2 generate j1::t1 as t1, t2 as t2, t3 as t3, tri_f as tri_f, t1_f as t1_f, unigram_f::freq as t2_f;
j3 = join j2 by t3, unigram_f by t1;
j3 = foreach j3 generate j2::t1 as t1, t2 as t2, t3 as t3, tri_f as tri_f, t1_f as t1_f, t2_f as t2_f, unigram_f::freq as t3_f;
j4 = join j3 by (t1,t2), bigram_f by (t1,t2);
j4 = foreach j4 generate j3::t1 as t1, j3::t2 as t2, j3::t3 as t3, tri_f as tri_f, t1_f as t1_f, t2_f as t2_f, t3_f as t3_f, bigram_f::freq as t1_t2_f;
j5 = join j4 by (t2,t3), bigram_f by (t1,t2);
j5 = foreach j5 generate j4::t1 as t1, j4::t2 as t2, j4::t3 as t3, tri_f as tri_f, t1_f as t1_f, t2_f as t2_f, t3_f as t3_f, t1_t2_f as t1_t2_f, bigram_f::freq as t2_t3_f;

-- mi = log2( p(x,y,z) / (p(x)*p(y)*p(z) + p(x)*p(y,z) + p(x,y)*p(z)) )
mutual_info = foreach j5 {
  px = (double)t1_f / unigram_c.count;
  py = (double)t2_f / unigram_c.count;
  pz = (double)t3_f / unigram_c.count;
  pxy = (double)t1_t2_f / bigram_c.count;
  pyz = (double)t2_t3_f / bigram_c.count;
  dep_pxyz = (double)tri_f / trigram_c.count;
  px_py_pz = px * py * pz;
  px_pyz = px * pyz;
  pxy_pz = pxy * pz;
  indep_pxyz = px_py_pz + px_pyz + pxy_pz;
  mi = LOG(dep_pxyz / indep_pxyz) / LOG(2);
  generate t1, t2, t3, tri_f, mi as mi;
}

-- sort and store
positive_mutual_info = filter mutual_info by mi>0;
sorted = order positive_mutual_info by mi desc;
store sorted into 'trigram_mutual_info';
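( and the trigram version of the same calculation as a python helper for local spot checks, following the formula in the comment above )

from math import log

def trigram_mi(tri_f, t1_f, t2_f, t3_f, t1_t2_f, t2_t3_f,
               unigram_c, bigram_c, trigram_c):
    # log2( p(x,y,z) / ( p(x)*p(y)*p(z) + p(x)*p(y,z) + p(x,y)*p(z) ) )
    px = float(t1_f) / unigram_c
    py = float(t2_f) / unigram_c
    pz = float(t3_f) / unigram_c
    pxy = float(t1_t2_f) / bigram_c
    pyz = float(t2_t3_f) / bigram_c
    pxyz = float(tri_f) / trigram_c
    return log(pxyz / (px * py * pz + px * pyz + pxy * pz), 2)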
result (support 100)
Rychnov nad Kněžnou 118 22.424602293608007
DoualaPays : CamerounAdresse 102 22.406178326732338
Bajaga i Instruktori 118 22.385818095638346
d'activité : Principaux 128 22.367493053820052
Feasa ar Éirinn 121 22.214224197726296
Séré de Rivières 109 22.15457802657505
Tiako I Madagasikara 134 22.149050193285223
African-Eurasian Migratory Waterbirds 145 22.103210375266972
Kw ` alaams 110 22.09419756064452
Ponts et Chaussées 106 22.084725134502357
result (support 1000)
Abdu ` l-Bahá 1011 549844.1721224862
Dravida Munnetra Kazhagam 1043 519490.7200429379
Ab urbe condita 1059 392353.38202227355
Dar es Salaam 1130 280500.2163137233
Kitts and Nevis 1095 266394.29635043273
Procter & Gamble 1255 256374.4275852869
Antigua and Barbuda 1290 245226.03987268283
agnostic or atheist 1068 235635.2233948117
Vasco da Gama 1401 224613.90052744994
Ku Klux Klan 1944 224200.36813564494
graphs
sh> hfs -cat bigram_mutual_info/* | ./reparse_ngram_mi.py > bigram_mutual_info.tsv
sh> hfs -cat trigram_mutual_info/* | ./reparse_ngram_mi.py > trigram_mutual_info.tsv

R> b = read.delim('bigram_mutual_info.tsv', sep="\t", header=F)
R> ggplot(b, aes(log10(V2),V3)) + geom_point(alpha=1/5) + xlab('log bigram freq') + ylab('mutual info') + opts(title="bigram mutual info")
R> t = read.delim('trigram_mutual_info.tsv', sep="\t", header=F)
R> t = t[t$V2>750,]
R> ggplot(t, aes(log10(V2),V3)) + geom_point(alpha=1/5) + xlab('log trigram freq') + ylab('mutual info') + opts(title="trigram mutual info")
pareto front
hfs -cat /user/hadoop/trigram_mutual_info/* | ./reparse_ngram_mi.py > trigram_mutual_info.tsv
cat trigram_mutual_info.tsv | ./pareto.py > trigram_mutual_info.pareto.tsv
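( pareto.py isn't listed; a sketch of the idea, assuming the reparsed tsv has the ngram in column 1, freq in column 2 and mutual info in column 3, and that the front is the set of rows not dominated on both freq and mi )

#!/usr/bin/env python
# sketch of a pareto front filter over (freq, mi) rows; column positions are assumptions
import sys

rows = []
for line in sys.stdin:
    cols = line.rstrip('\n').split('\t')
    rows.append((float(cols[1]), float(cols[2]), line))  # (freq, mi, original line)

# walk rows in decreasing freq order; a row is on the front if its mi is
# higher than that of every row with greater freq seen so far
rows.sort(key=lambda r: (-r[0], -r[1]))
best_mi = float('-inf')
for freq, mi, line in rows:
    if mi > best_mi:
        sys.stdout.write(line)
        best_mi = mi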
bigrams at a distance
first filter out bracket tokens (-LRB- and -RRB-) and any word that doesn't have at least one letter
hadoop jar ~/contrib/streaming/hadoop-streaming.jar \
  -input sentences_sans_url_long_words -output sentences_single_char_filtered \
  -mapper filter_single_non_char.py -file filter_single_non_char.py \
  -numReduceTasks 0
rebuild unigrams & bigrams
hadoop jar ~/contrib/streaming/hadoop-streaming.jar \
  -input sentences_single_char_filtered -output unigrams_d \
  -mapper "ngrams.py 1" -file ngrams.py \
  -numReduceTasks 0

hadoop jar ~/contrib/streaming/hadoop-streaming.jar \
  -input sentences_single_char_filtered -output bigrams_d \
  -mapper bigrams_at_a_distance.py -file bigrams_at_a_distance.py \
  -numReduceTasks 0
from 55,172,969 sentences to 3,154,200,111 bigrams
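( bigrams_at_a_distance.py isn't listed; a sketch, assuming it emits every ordered pair of tokens that co-occur within some window of each other; the window of 5 is a guess )

#!/usr/bin/env python
# sketch of a bigrams_at_a_distance.py style mapper: emit every pair of tokens
# that co-occur within WINDOW positions of each other; WINDOW = 5 is an assumption
import sys

WINDOW = 5

for line in sys.stdin:
    tokens = line.split()
    for i, t1 in enumerate(tokens):
        for j in range(i + 1, min(i + 1 + WINDOW, len(tokens))):
            sys.stdout.write('%s\t%s\n' % (t1, tokens[j]))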
recalculate _f and _c for unigrams_d and bigrams_d
-- ngram_d_freq_and_count.pig
define calc_frequencies_and_count(A, key, F, C) returns void {
  grped = group $A by $key;
  ngram_f = foreach grped generate flatten(group), COUNT($A) as freq;
  store ngram_f into '$F';
  grped = group ngram_f all;
  ngram_c = foreach grped generate SUM(ngram_f.freq) as count;
  store ngram_c into '$C';
}
unigrams = load 'unigrams_d' as (t1:chararray);
calc_frequencies_and_count(unigrams, t1, unigram_d_f, unigram_d_c);
bigrams = load 'bigrams_d' as (t1:chararray, t2:chararray);
calc_frequencies_and_count(bigrams, '(t1,t2)', bigram_d_f, bigram_d_c);
result
unigram_d_f (dus) 95,097,917
unigram_d_c = 1,161,564,928
bigram_d_f (dus) 5,876,653,769
bigram_d_c = 3,154,200,111

$ hfs -cat /user/hadoop/bigram_d_mutual_info/part-r-00000 | head -n 50
expr expr 20888 17.048076978103953
ifeq ifeq 6507 16.186081768917436
Burkina Faso 5473 16.145461115766583
Rotten Tomatoes 5705 15.755724868405853
Kuala Lumpur 6457 15.723829544745481
SO Strikeouts 5788 15.564872663796615
Masovian east-central 8452 15.412136208849287
Earned SO 5651 15.40456407733207
Wins Losses 7984 15.239019810631586
Tel Aviv 9137 15.158101160760133
Baton Rouge 5599 15.097851626874347
Dungeons Dragons 5509 14.84334814467409
Trinidad Tobago 6241 14.770530324750123
Figure Skating 5528 14.68826766933851
Lok Sabha 7435 14.679706153380339
background-color E9E9E9 8490 14.652837972898762
Haleakala NEAT 5328 14.514309090636834
Kitt Spacewatch 17854 14.435471379604573
mean / stddev
build token distances
hadoop fs -mkdir token_distance

hadoop jar ~/contrib/streaming/hadoop-streaming.jar \
  -input sentences -output token_distance/bigram_distance \
  -mapper token_distance.py -file token_distance.py \
  -numReduceTasks 0
55,378,018 sentences -> 20,089,376,334 pairs
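( token_distance.py isn't listed either; given mean_sd.pig below loads (t1, t2, distance:int) and the mean quantiles come out roughly in the -9..9 range, a sketch that emits the signed offset between each pair of nearby tokens; the window of 10 is a guess )

#!/usr/bin/env python
# sketch of a token_distance.py style mapper: for each pair of tokens within
# WINDOW positions of each other emit (t1, t2, signed distance); WINDOW is assumed
import sys

WINDOW = 10

for line in sys.stdin:
    tokens = line.split()
    for i, t1 in enumerate(tokens):
        for j in range(max(0, i - WINDOW), min(len(tokens), i + WINDOW + 1)):
            if i != j:
                sys.stdout.write('%s\t%s\t%d\n' % (t1, tokens[j], j - i))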
calc mean/sd of all pairs
-- mean_sd.pig
-- see http://en.wikipedia.org/wiki/Standard_deviation#Rapid_calculation_methods
td = load 'token_distance/bigram_distance' as (t1:chararray, t2:chararray, distance:int);
tds = foreach td generate t1, t2, distance, distance*distance as distance_sqr;
grped = group tds by (t1,t2);
mean_sd = foreach grped {
  n = COUNT(tds);
  s1 = SUM(tds.distance);
  s2 = SUM(tds.distance_sqr);
  mean = (double)s1/n;
  sd_nomin = (double)(n*s2 - s1*s1);
  sd_denom = n * (n-1);
  sd = SQRT( sd_nomin / sd_denom );
  generate flatten(group), n, mean, sd;
}
store mean_sd into 'token_distance/mean_sd';
calculate quantiles of mean / sd values (sanity)
-- mean_sd_quantiles.pig
register /mnt/datafu/dist/datafu-0.0.1.jar;
define Quantile datafu.pig.stats.StreamingQuantile('101');
define calc_quantiles(src, field, out) returns void {
  just_field = foreach $src generate $field;
  grped = group just_field all;
  quantiles = foreach grped generate Quantile(just_field);
  store quantiles into '$out';
}
mean_sd = load 'token_distance/mean_sd' as (t1:chararray, t2:chararray, n:int, mean:double, stddev:double);
calc_quantiles(mean_sd, mean, 'token_distance/mean_quantiles');
calc_quantiles(mean_sd, stddev, 'token_distance/stddev_quantiles');
hadoop@ip-10-17-57-166:~$ hfs -cat /user/hadoop/token_distance/mean_quantiles/* (-9.0,-8.0,-7.5,-7.0,-6.5,-6.0,-6.0,-5.5,-5.375,-5.0,-5.0,-4.5,-4.4,-4.0,-4.0,-3.75,-3.5,-3.4,-3.1683168316831685,-3.0,-3.0,-2.7777777777777777,-2.607142857142857,-2.5,-2.4,-2.25,-2.0,-2.0,-2.0,-1.875,-1.6944444444444444,-1.6,-1.5,-1.5,-1.3333333333333333,-1.2352941176470589,-1.0588235294117647,-1.0,-1.0,-1.0,-0.8888888888888888,-0.75,-0.6666666666666666,-0.5157894736842106,-0.5,-0.5,-0.34782608695652173,-0.2702702702702703,-0.16666666666666666,0.0,0.0,0.0,0.02502691065662002,0.2,0.3076923076923077,0.4,0.5,0.5,0.5680933852140078,0.6666666666666666,0.7777777777777778,0.9354838709677419,1.0,1.0,1.0,1.1111111111111112,1.25,1.3333333333333333,1.5,1.5,1.625,1.75,1.9714285714285715,2.0,2.0,2.1,2.3214285714285716,2.48,2.5,2.6666666666666665,2.8620689655172415,3.0,3.0,3.25,3.5,3.5238095238095237,3.8333333333333335,4.0,4.142857142857143,4.5,4.666666666666667,5.0,5.0,5.5,5.666666666666667,6.0,6.333333333333333,6.666666666666667,7.0,7.666666666666667,9.0)
hadoop@ip-10-17-57-166:~$ hfs -cat /user/hadoop/token_distance/stddev_quantiles/* (0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.6030226891555273,0.7071067811865476,0.7071067811865476,0.7071067811865476,0.7071067811865476,1.1547005383792515,1.4142135623730951,1.4142135623730951,1.4142135623730951,1.4142135623730951,1.6278820596099706,1.8085096269323728,2.073644135332772,2.1213203435596424,2.1213203435596424,2.1213203435596424,2.23606797749979,2.4257748058625186,2.6076809620810595,2.819996622760558,2.8284271247461903,2.8284271247461903,2.886751345948129,3.03315017762062,3.2145502536643185,3.420526275297414,3.519648784380674,3.5355339059327378,3.5355339059327378,3.669695718539436,3.8172540616821107,4.0,4.08248290463863,4.203173404306164,4.242640687119285,4.242640687119285,4.343246179462463,4.444097208657794,4.541892543729748,4.618802153517006,4.715628712080461,4.793585457511062,4.88535225614967,4.949747468305833,4.949747468305833,4.979959839195493,5.036533199202271,5.118869382571086,5.18613073726338,5.237229365663817,5.297106327329854,5.366563145999495,5.440588203494177,5.504543577809154,5.5506327395643655,5.609515724790034,5.656854249492381,5.656854249492381,5.7016464227295405,5.770615218501403,5.830951894845301,5.894913061275798,5.9717615272398445,6.0332412515993425,6.09831551536455,6.18465843842649,6.25642526870234,6.3508529610858835,6.363961030678928,6.3979848801256,6.494261974735176,6.597558470963162,6.685805860178712,6.8452277432633934,6.990072325170257,7.0710678118654755,7.0710678118654755,7.211102550927978,7.41057802513857,7.637626158259733,7.7781745930520225,7.788880963698615,8.144527815247077,8.48528137423857,8.48528137423857,9.192388155425117,9.192388155425117,9.899494936611665,10.606601717798213,12.727922061357855)
todo:
- mean_sd analysis
- pos tagging and mi on AN and ANN phrases
- pareto front analysis
- likelihood ratios
embedded pig notes
download http://hivelocity.dl.sourceforge.net/project/jython/jython/2.5.2/jython_installer-2.5.2.jar and unpack it with jar xf
x = load 'numbers'; y = filter x by $0>3;
java -cp pig/pig.jar:jython/jython.jar org.apache.pig.Main -x local -g jython test.py
#!/usr/bin/python
from org.apache.pig.scripting import *

def main():
    P = Pig.compile("""
        x = load 'numbers';
        y = filter x by $0>3;
        store y into '$out';
    """)
    out = 'test.out'
    job = P.bind().runSingle()
    print 'success?', job.isSuccessful()
    print 'result', job.result('out')

if __name__ == '__main__':
    main()