Index: modules/analysis/common/build.xml =================================================================== --- modules/analysis/common/build.xml (revision 948862) +++ modules/analysis/common/build.xml (working copy) @@ -49,7 +49,7 @@ nobak="on"/> - + @@ -68,8 +68,33 @@ - + + + + + + + + + + + + + + + + + + + + Index: modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro =================================================================== --- modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro (revision 0) +++ modules/analysis/common/src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro (revision 0) @@ -0,0 +1,307 @@ +/* + * Copyright 2001-2005 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Generated from IANA Root Zone Database +// file version from May 27, 2010 11:34:17 AM +// generated on May 27, 2010 3:45:23 PM +// by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros + +ASCIITLD = \\. ( + [aA][cC] + | [aA][dD] + | [aA][eE] + | [aA][eE][rR][oO] + | [aA][fF] + | [aA][gG] + | [aA][iI] + | [aA][lL] + | [aA][mM] + | [aA][nN] + | [aA][oO] + | [aA][qQ] + | [aA][rR] + | [aA][rR][pP][aA] + | [aA][sS] + | [aA][sS][iI][aA] + | [aA][tT] + | [aA][uU] + | [aA][wW] + | [aA][xX] + | [aA][zZ] + | [bB][aA] + | [bB][bB] + | [bB][dD] + | [bB][eE] + | [bB][fF] + | [bB][gG] + | [bB][hH] + | [bB][iI] + | [bB][iI][zZ] + | [bB][jJ] + | [bB][mM] + | [bB][nN] + | [bB][oO] + | [bB][rR] + | [bB][sS] + | [bB][tT] + | [bB][vV] + | [bB][wW] + | [bB][yY] + | [bB][zZ] + | [cC][aA] + | [cC][aA][tT] + | [cC][cC] + | [cC][dD] + | [cC][fF] + | [cC][gG] + | [cC][hH] + | [cC][iI] + | [cC][kK] + | [cC][lL] + | [cC][mM] + | [cC][nN] + | [cC][oO] + | [cC][oO][mM] + | [cC][oO][oO][pP] + | [cC][rR] + | [cC][uU] + | [cC][vV] + | [cC][xX] + | [cC][yY] + | [cC][zZ] + | [dD][eE] + | [dD][jJ] + | [dD][kK] + | [dD][mM] + | [dD][oO] + | [dD][zZ] + | [eE][cC] + | [eE][dD][uU] + | [eE][eE] + | [eE][gG] + | [eE][rR] + | [eE][sS] + | [eE][tT] + | [eE][uU] + | [fF][iI] + | [fF][jJ] + | [fF][kK] + | [fF][mM] + | [fF][oO] + | [fF][rR] + | [gG][aA] + | [gG][bB] + | [gG][dD] + | [gG][eE] + | [gG][fF] + | [gG][gG] + | [gG][hH] + | [gG][iI] + | [gG][lL] + | [gG][mM] + | [gG][nN] + | [gG][oO][vV] + | [gG][pP] + | [gG][qQ] + | [gG][rR] + | [gG][sS] + | [gG][tT] + | [gG][uU] + | [gG][wW] + | [gG][yY] + | [hH][kK] + | [hH][mM] + | [hH][nN] + | [hH][rR] + | [hH][tT] + | [hH][uU] + | [iI][dD] + | [iI][eE] + | [iI][lL] + | [iI][mM] + | [iI][nN] + | [iI][nN][fF][oO] + | [iI][nN][tT] + | [iI][oO] + | [iI][qQ] + | [iI][rR] + | [iI][sS] + | [iI][tT] + | [jJ][eE] + | [jJ][mM] + | [jJ][oO] + | [jJ][oO][bB][sS] + | [jJ][pP] + | [kK][eE] + | [kK][gG] + | [kK][hH] + | [kK][iI] + | [kK][mM] + | [kK][nN] + | [kK][pP] + | [kK][rR] + | [kK][wW] + | [kK][yY] + | [kK][zZ] + | [lL][aA] + | [lL][bB] + | [lL][cC] + | [lL][iI] + | [lL][kK] + | [lL][rR] + | [lL][sS] + | [lL][tT] + | [lL][uU] + | [lL][vV] + | [lL][yY] + | [mM][aA] + | [mM][cC] + | [mM][dD] + | [mM][eE] + | [mM][gG] + | [mM][hH] + | [mM][iI][lL] + | [mM][kK] + | [mM][lL] + | [mM][mM] + | [mM][nN] + | [mM][oO] + | [mM][oO][bB][iI] + | [mM][pP] + | [mM][qQ] + | [mM][rR] + | [mM][sS] + | [mM][tT] + | [mM][uU] + | [mM][uU][sS][eE][uU][mM] + | [mM][vV] + | [mM][wW] + | [mM][xX] + | [mM][yY] + | [mM][zZ] + | [nN][aA] + | [nN][aA][mM][eE] + | [nN][cC] + | [nN][eE] + | [nN][eE][tT] + | [nN][fF] + | [nN][gG] + | [nN][iI] + | [nN][lL] + | [nN][oO] + | [nN][pP] + | [nN][rR] + | [nN][uU] + | [nN][zZ] + | [oO][mM] + | [oO][rR][gG] + | [pP][aA] + | [pP][eE] + | [pP][fF] + | [pP][gG] + | [pP][hH] + | [pP][kK] + | [pP][lL] + | [pP][mM] + | [pP][nN] + | [pP][rR] + | [pP][rR][oO] + | [pP][sS] + | [pP][tT] + | [pP][wW] + | [pP][yY] + | [qQ][aA] + | [rR][eE] + | [rR][oO] + | [rR][sS] + | [rR][uU] + | [rR][wW] + | [sS][aA] + | [sS][bB] + | [sS][cC] + | [sS][dD] + | [sS][eE] + | [sS][gG] + | [sS][hH] + | [sS][iI] + | [sS][jJ] + | [sS][kK] + | [sS][lL] + | [sS][mM] + | [sS][nN] + | [sS][oO] + | [sS][rR] + | [sS][tT] + | [sS][uU] + | [sS][vV] + | [sS][yY] + | [sS][zZ] + | [tT][cC] + | [tT][dD] + | [tT][eE][lL] + | [tT][fF] + | [tT][gG] + | [tT][hH] + | [tT][jJ] + | [tT][kK] + | [tT][lL] + | [tT][mM] + | [tT][nN] + | [tT][oO] + | [tT][pP] + | [tT][rR] + | [tT][rR][aA][vV][eE][lL] + | [tT][tT] + | [tT][vV] + | [tT][wW] + | [tT][zZ] + | [uU][aA] + | [uU][gG] + | [uU][kK] + | [uU][sS] + | [uU][yY] + | [uU][zZ] + | [vV][aA] + | [vV][cC] + | [vV][eE] + | [vV][gG] + | [vV][iI] + | [vV][nN] + | [vV][uU] + | [wW][fF] + | [wW][sS] + | [xX][nN]--0[zZ][wW][mM]56[dD] + | [xX][nN]--11[bB]5[bB][sS]3[aA]9[aA][jJ]6[gG] + | [xX][nN]--80[aA][kK][hH][bB][yY][kK][nN][jJ]4[fF] + | [xX][nN]--9[tT]4[bB]11[yY][iI]5[aA] + | [xX][nN]--[dD][eE][bB][aA]0[aA][dD] + | [xX][nN]--[gG]6[wW]251[dD] + | [xX][nN]--[hH][gG][bB][kK]6[aA][jJ]7[fF]53[bB][bB][aA] + | [xX][nN]--[hH][lL][cC][jJ]6[aA][yY][aA]9[eE][sS][cC]7[aA] + | [xX][nN]--[jJ][xX][aA][lL][pP][dD][lL][pP] + | [xX][nN]--[kK][gG][bB][eE][cC][hH][tT][vV] + | [xX][nN]--[mM][gG][bB][aA][aA][mM]7[aA]8[hH] + | [xX][nN]--[mM][gG][bB][eE][rR][pP]4[aA]5[dD]4[aA][rR] + | [xX][nN]--[pP]1[aA][iI] + | [xX][nN]--[wW][gG][bB][hH]1[cC] + | [xX][nN]--[zZ][cC][kK][zZ][aA][hH] + | [yY][eE] + | [yY][tT] + | [zZ][aA] + | [zZ][mM] + | [zZ][wW] + ) \\.? // Accept trailing root (empty) domain + Index: modules/analysis/common/src/tools/java/org/apache/lucene/analysis/standard/GenerateJflexTLDMacros.java =================================================================== --- modules/analysis/common/src/tools/java/org/apache/lucene/analysis/standard/GenerateJflexTLDMacros.java (revision 0) +++ modules/analysis/common/src/tools/java/org/apache/lucene/analysis/standard/GenerateJflexTLDMacros.java (revision 0) @@ -0,0 +1,167 @@ +package org.apache.lucene.analysis.standard; + +/* + * Copyright 2001-2005 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.*; +import java.net.*; +import java.text.DateFormat; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Generates a file containing JFlex macros to accept valid ASCII TLDs + * (top level domains), for inclusion in JFlex grammars that can accept + * domain names. + *

+ * The IANA Root Zone Database is queried via HTTP from cmdline arg0, + * the response is parsed, and + * the results are written out to a file containing a JFlex macro that will + * accept all valid ASCII-only TLDs, including punycode forms of + * internationalized TLDs (output file cmdline arg1). + */ +public class GenerateJflexTLDMacros { + + public static void main(String... args) throws Exception { + if (args.length != 2) { + System.err.println("Cmd line params:"); + System.err.println(" java " + GenerateJflexTLDMacros.class.getName() + " "); + System.exit(1); + } + new GenerateJflexTLDMacros(args[0], args[1]).execute(); + } + + private static final String NL = System.getProperty("line.separator"); + + private static final String APACHE_LICENSE + = "/*" + NL + + " * Copyright 2001-2005 The Apache Software Foundation." + NL + + " *" + NL + + " * Licensed under the Apache License, Version 2.0 (the \"License\");" + NL + + " * you may not use this file except in compliance with the License." + NL + + " * You may obtain a copy of the License at" + NL + + " *" + NL + + " * http://www.apache.org/licenses/LICENSE-2.0" + NL + + " *" + NL + + " * Unless required by applicable law or agreed to in writing, software" + NL + + " * distributed under the License is distributed on an \"AS IS\" BASIS," + NL + + " * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." + NL + + " * See the License for the specific language governing permissions and" + NL + + " * limitations under the License." + NL + + " */" + NL + NL; + + private final Pattern TLD_PATTERN = Pattern.compile("([\\-A-Za-z0-9]+)\\. NS .*"); + + private final URL tldFileURL; + private long tldFileLastModified = -1L; + private final File outputFile; + + public GenerateJflexTLDMacros(String tldFile, String outputFile) throws Exception { + this.tldFileURL=new URL(tldFile); + this.outputFile=new File(outputFile); + } + + public void execute() throws IOException { + final SortedSet tlds = getIANARootZoneDatabase(); + writeOutput(tlds); + System.err.println("Wrote " + tlds.size() + " top level domains to '" + outputFile + "'."); + } + + /** + * Downloads the IANA Root Zone Database. + */ + private SortedSet getIANARootZoneDatabase() throws IOException { + final SortedSet tlds = new TreeSet(); + final URLConnection conn = tldFileURL.openConnection(); + conn.setUseCaches(false); + conn.addRequestProperty("Cache-Control", "no-cache"); + conn.connect(); + tldFileLastModified = conn.getLastModified(); + BufferedReader r = new BufferedReader(new InputStreamReader(conn.getInputStream(), "US-ASCII")); + try { + String line; + while ((line=r.readLine())!=null) { + final Matcher matcher = TLD_PATTERN.matcher(line); + if (matcher.matches()) { + tlds.add(matcher.group(1).toLowerCase(Locale.US)); + } + } + return tlds; + } finally { + r.close(); + } + } + + private void writeOutput(SortedSet tlds) throws IOException { + final DateFormat df = DateFormat.getDateTimeInstance(DateFormat.DEFAULT, DateFormat.DEFAULT, Locale.US); + df.setTimeZone(TimeZone.getTimeZone("UTC")); + final Writer writer = new OutputStreamWriter(new FileOutputStream(outputFile), "UTF-8"); + try { + writer.write(APACHE_LICENSE); + writer.write("// Generated from IANA Root Zone Database <"); + writer.write(tldFileURL.toString()); + writer.write(">"); + writer.write(NL); + if (tldFileLastModified > 0L) { + writer.write("// file version from "); + writer.write(df.format(tldFileLastModified)); + writer.write(NL); + } + writer.write("// generated on "); + writer.write(df.format(new Date())); + writer.write(NL); + writer.write("// by "); + writer.write(this.getClass().getName()); + writer.write(NL); + writer.write(NL); + writer.write("ASCIITLD = \\\\. ("); + writer.write(NL); + boolean isFirst = true; + for (String tld : tlds) { + String ASCIITLDregex = getCaseInsensitiveRegex(tld); + writer.write("\t"); + if (isFirst) { + isFirst = false; + writer.write(" "); + } else { + writer.write("| "); + } + writer.write(ASCIITLDregex); + writer.write(NL); + } + writer.write("\t) \\\\.? // Accept trailing root (empty) domain"); + writer.write(NL); + writer.write(NL); + } finally { + writer.close(); + } + } + + private String getCaseInsensitiveRegex(String ASCIITLD) { + StringBuilder builder = new StringBuilder(); + for (int pos = 0 ; pos < ASCIITLD.length() ; ++pos) { + String ch = ASCIITLD.substring(pos, pos + 1); + if (Character.isDigit(ch.charAt(0)) || ch.equals("-")) { + builder.append(ch); + } else { + builder.append("[").append(ch).append(ch.toUpperCase()).append("]"); + } + } + return builder.toString(); + } + +} Property changes on: modules\analysis\common\src\tools\java\org\apache\lucene\analysis\standard\GenerateJflexTLDMacros.java ___________________________________________________________________ Added: svn:keywords + Date Author Id Revision HeadURL Added: svn:eol-style + native