// Collects the pieces of the character class for non-Latin hashtag characters.
// Each [start, end] pair below is an inclusive code point range; the pairs are
// passed to addCharsToCharClass in the order listed, then the collected pieces
// are joined and handed to regexSupplant.
var nonLatinHashtagChars = [];
(function () {
  var ranges = [
    // Cyrillic
    [0x0400, 0x04ff], // Cyrillic
    [0x0500, 0x0527], // Cyrillic Supplement
    [0x2de0, 0x2dff], // Cyrillic Extended A
    [0xa640, 0xa69f], // Cyrillic Extended B
    // Hebrew
    [0x0591, 0x05bf], // Hebrew
    [0x05c1, 0x05c2],
    [0x05c4, 0x05c5],
    [0x05c7, 0x05c7],
    [0x05d0, 0x05ea],
    [0x05f0, 0x05f4],
    [0xfb12, 0xfb28], // Hebrew Presentation Forms
    [0xfb2a, 0xfb36],
    [0xfb38, 0xfb3c],
    [0xfb3e, 0xfb3e],
    [0xfb40, 0xfb41],
    [0xfb43, 0xfb44],
    [0xfb46, 0xfb4f],
    // Arabic
    [0x0610, 0x061a], // Arabic
    [0x0620, 0x065f],
    [0x066e, 0x06d3],
    [0x06d5, 0x06dc],
    [0x06de, 0x06e8],
    [0x06ea, 0x06ef],
    [0x06fa, 0x06fc],
    [0x06ff, 0x06ff],
    [0x0750, 0x077f], // Arabic Supplement
    [0x08a0, 0x08a0], // Arabic Extended A
    [0x08a2, 0x08ac],
    [0x08e4, 0x08fe],
    [0xfb50, 0xfbb1], // Arabic Pres. Forms A
    [0xfbd3, 0xfd3d],
    [0xfd50, 0xfd8f],
    [0xfd92, 0xfdc7],
    [0xfdf0, 0xfdfb],
    [0xfe70, 0xfe74], // Arabic Pres. Forms B
    [0xfe76, 0xfefc],
    [0x200c, 0x200c], // Zero-Width Non-Joiner
    // Thai
    [0x0e01, 0x0e3a],
    [0x0e40, 0x0e4e],
    // Hangul (Korean)
    [0x1100, 0x11ff], // Hangul Jamo
    [0x3130, 0x3185], // Hangul Compatibility Jamo
    [0xA960, 0xA97F], // Hangul Jamo Extended-A
    [0xAC00, 0xD7AF], // Hangul Syllables
    [0xD7B0, 0xD7FF], // Hangul Jamo Extended-B
    [0xFFA1, 0xFFDC], // half-width Hangul
    // Japanese and Chinese
    [0x30A1, 0x30FA], // Katakana (full-width)
    [0x30FC, 0x30FE], // Katakana Chouon and iteration marks (full-width)
    [0xFF66, 0xFF9F], // Katakana (half-width)
    [0xFF70, 0xFF70], // Katakana Chouon (half-width)
    [0xFF10, 0xFF19], // Latin (full-width)
    [0xFF21, 0xFF3A], // Latin (full-width)
    [0xFF41, 0xFF5A], // Latin (full-width)
    [0x3041, 0x3096], // Hiragana
    [0x3099, 0x309E], // Hiragana voicing and iteration mark
    [0x3400, 0x4DBF], // Kanji (CJK Extension A)
    [0x4E00, 0x9FFF], // Kanji (Unified)
    // Kanji (CJK Extension B), 0x20000-0x2A6DF, is intentionally left out:
    // it was disabled because it breaks the Regex.
    // NOTE(review): the next three ranges are also above 0xFFFF, like the
    // disabled Extension B range -- confirm addCharsToCharClass handles them.
    [0x2A700, 0x2B73F], // Kanji (CJK Extension C)
    [0x2B740, 0x2B81F], // Kanji (CJK Extension D)
    [0x2F800, 0x2FA1F], // Kanji (CJK supplement)
    [0x3003, 0x3003], // Kanji iteration mark
    [0x3005, 0x3005], // Kanji iteration mark
    [0x303B, 0x303B] // Han iteration mark
  ];
  // Feed every range to the helper in table order.
  for (var i = 0; i < ranges.length; i++) {
    addCharsToCharClass(nonLatinHashtagChars, ranges[i][0], ranges[i][1]);
  }
}());
twttr.txt.regexen.nonLatinHashtagChars = regexSupplant(nonLatinHashtagChars.join(""));
// Builds the set of non-Latin hashtag code points with Regenerate.
// Table entries with two elements are inclusive [start, end] ranges and are
// applied with addRange(); entries with one element are single code points
// and are applied with add(). Entries are applied in the order listed.
var nonLatinHashtagChars = (function () {
  var entries = [
    // Cyrillic
    [0x0400, 0x04FF], // Cyrillic
    [0x0500, 0x0527], // Cyrillic Supplement
    [0x2DE0, 0x2DFF], // Cyrillic Extended A
    [0xA640, 0xA69F], // Cyrillic Extended B
    // Hebrew
    [0x0591, 0x05BF], // Hebrew
    [0x05C1, 0x05C2],
    [0x05C4, 0x05C5],
    [0x05c7],
    [0x05D0, 0x05EA],
    [0x05F0, 0x05F4],
    [0xFB12, 0xFB28], // Hebrew Presentation Forms
    [0xFB2A, 0xFB36],
    [0xFB38, 0xFB3C],
    [0xFB3E, 0xFB3E],
    [0xFB40, 0xFB41],
    [0xFB43, 0xFB44],
    [0xFB46, 0xFB4F],
    // Arabic
    [0x0610, 0x061A], // Arabic
    [0x0620, 0x065F],
    [0x066E, 0x06D3],
    [0x06D5, 0x06DC],
    [0x06DE, 0x06E8],
    [0x06EA, 0x06EF],
    [0x06FA, 0x06FC],
    [0x06FF, 0x06FF],
    [0x0750, 0x077F], // Arabic Supplement
    [0x08A0, 0x08A0], // Arabic Extended A
    [0x08A2, 0x08AC],
    [0x08E4, 0x08FE],
    [0xFB50, 0xFBB1], // Arabic Pres. Forms A
    [0xFBD3, 0xFD3D],
    [0xFD50, 0xFD8F],
    [0xFD92, 0xFDC7],
    [0xFDF0, 0xFDFB],
    [0xFE70, 0xFE74], // Arabic Pres. Forms B
    [0xFE76, 0xFEFC],
    [0x200C, 0x200C], // Zero-Width Non-Joiner
    // Thai
    [0x0E01, 0x0E3A],
    [0x0E40, 0x0E4E],
    // Hangul (Korean)
    [0x1100, 0x11FF], // Hangul Jamo
    [0x3130, 0x3185], // Hangul Compatibility Jamo
    [0xA960, 0xA97F], // Hangul Jamo Extended-A
    [0xAC00, 0xD7AF], // Hangul Syllables
    [0xD7B0, 0xD7FF], // Hangul Jamo Extended-B
    [0xFFA1, 0xFFDC], // half-width Hangul
    // Japanese and Chinese
    [0x30A1, 0x30FA], // Katakana (full-width)
    [0x30FC, 0x30FE], // Katakana Chouon and iteration marks (full-width)
    [0xFF66, 0xFF9F], // Katakana (half-width)
    [0xFF70], // Katakana Chouon (half-width)
    [0xFF10, 0xFF19], // Latin (full-width)
    [0xFF21, 0xFF3A], // Latin (full-width)
    [0xFF41, 0xFF5A], // Latin (full-width)
    [0x3041, 0x3096], // Hiragana
    [0x3099, 0x309E], // Hiragana voicing and iteration mark
    [0x3400, 0x4DBF], // Kanji (CJK Extension A)
    [0x4E00, 0x9FFF], // Kanji (Unified)
    [0x20000, 0x2A6DF], // Kanji (CJK Extension B)
    [0x2A700, 0x2B73F], // Kanji (CJK Extension C)
    [0x2B740, 0x2B81F], // Kanji (CJK Extension D)
    [0x2F800, 0x2FA1F], // Kanji (CJK supplement)
    [0x3003], // Kanji iteration mark
    [0x3005], // Kanji iteration mark
    [0x303B] // Han iteration mark
  ];
  var set = regenerate();
  for (var i = 0; i < entries.length; i++) {
    var entry = entries[i];
    // Keep using each call's return value, exactly as the chained form does.
    if (entry.length === 1) {
      set = set.add(entry[0]);
    } else {
      set = set.addRange(entry[0], entry[1]);
    }
  }
  return set;
}());
// Store the regular expression produced from the set on twttr.txt.regexen.
twttr.txt.regexen.nonLatinHashtagChars = nonLatinHashtagChars.toRegExp();
Even better, this could be done as part of a build process instead of at runtime.
This way, the source code (before building) is still very readable/maintainable, but the built code is optimized for run-time performance.
Would you be interested in a pull request that ports all the regular expressions to Regenerate + adds a simple build script?