': + if docPerParagraph: + write(date, title, pending, fOut) + else: + pending.append('PARSEP') + elif not reTagOnly.match(line): + pending.append(line) + if title is not None and len(pending) > 0: + write(date, title, pending, fOut) + + didEnglish = True + +# '/x/lucene/data/europarl/all.lines.txt' +dirIn = sys.argv[1] +fileOut = sys.argv[2] + +fOut = open(fileOut, 'wb') + +for fileName in glob.glob('%s/??-??.tgz' % dirIn): + if fileName.endswith('.tgz'): + print 'process %s; %d docs so far...' % (fileName, docCount) + processTar(fileName, fOut) + +print 'TOTAL: %s' % docCount + +#run something like this: +""" + +# Europarl V5 makes 76,917 docs, avg 38.6 KB per +python -u europarl.py /x/lucene/data/europarl /x/lucene/data/europarl/tmp.lines.txt +shuf /x/lucene/data/europarl/tmp.lines.txt > /x/lucene/data/europarl/full.lines.txt +rm /x/lucene/data/europarl/tmp.lines.txt + +# Run again, this time each paragraph is a doc: +# Europarl V5 makes 5,607,746 paragraphs (one paragraph per line), avg 620 bytes per: +python -u europarl.py /x/lucene/data/europarl /x/lucene/data/europarl/tmp.lines.txt -docPerParagraph +shuf /x/lucene/data/europarl/tmp.lines.txt > /x/lucene/data/europarl/para.lines.txt +rm /x/lucene/data/europarl/tmp.lines.txt + +# ~5.5 MB gzip'd: +head -200 /x/lucene/data/europarl/full.lines.txt > tmp.txt +head -10000 /x/lucene/data/europarl/para.lines.txt >> tmp.txt +shuf tmp.txt > europarl.subset.txt +rm -f tmp.txt +gzip --best europarl.subset.txt +"""