match_test.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505
  1. // Copyright 2013 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package language
  5. import (
  6. "bytes"
  7. "flag"
  8. "fmt"
  9. "os"
  10. "path"
  11. "path/filepath"
  12. "strings"
  13. "testing"
  14. "golang.org/x/text/internal/testtext"
  15. "golang.org/x/text/internal/ucd"
  16. )
  17. var verbose = flag.Bool("verbose", false, "set to true to print the internal tables of matchers")
  18. func TestCompliance(t *testing.T) {
  19. filepath.Walk("testdata", func(file string, info os.FileInfo, err error) error {
  20. if info.IsDir() {
  21. return nil
  22. }
  23. r, err := os.Open(file)
  24. if err != nil {
  25. t.Fatal(err)
  26. }
  27. ucd.Parse(r, func(p *ucd.Parser) {
  28. name := strings.Replace(path.Join(p.String(0), p.String(1)), " ", "", -1)
  29. if skip[name] {
  30. return
  31. }
  32. t.Run(info.Name()+"/"+name, func(t *testing.T) {
  33. supported := makeTagList(p.String(0))
  34. desired := makeTagList(p.String(1))
  35. gotCombined, index, conf := NewMatcher(supported).Match(desired...)
  36. gotMatch := supported[index]
  37. wantMatch := mk(p.String(2))
  38. if gotMatch != wantMatch {
  39. t.Fatalf("match: got %q; want %q (%v)", gotMatch, wantMatch, conf)
  40. }
  41. wantCombined, err := Raw.Parse(p.String(3))
  42. if err == nil && gotCombined != wantCombined {
  43. t.Errorf("combined: got %q; want %q (%v)", gotCombined, wantCombined, conf)
  44. }
  45. })
  46. })
  47. return nil
  48. })
  49. }
  50. var skip = map[string]bool{
  51. // TODO: bugs
  52. // Honor the wildcard match. This may only be useful to select non-exact
  53. // stuff.
  54. "mul,af/nl": true, // match: got "af"; want "mul"
  55. // TODO: include other extensions.
  56. // combined: got "en-GB-u-ca-buddhist-nu-arab"; want "en-GB-fonipa-t-m0-iso-i0-pinyin-u-ca-buddhist-nu-arab"
  57. "und,en-GB-u-sd-gbsct/en-fonipa-u-nu-Arab-ca-buddhist-t-m0-iso-i0-pinyin": true,
  58. // Inconsistencies with Mark Davis' implementation where it is not clear
  59. // which is better.
  60. // Inconsistencies in combined. I think the Go approach is more appropriate.
  61. // We could use -u-rg- and -u-va- as alternative.
  62. "und,fr/fr-BE-fonipa": true, // combined: got "fr"; want "fr-BE-fonipa"
  63. "und,fr-CA/fr-BE-fonipa": true, // combined: got "fr-CA"; want "fr-BE-fonipa"
  64. "und,fr-fonupa/fr-BE-fonipa": true, // combined: got "fr-fonupa"; want "fr-BE-fonipa"
  65. "und,no/nn-BE-fonipa": true, // combined: got "no"; want "no-BE-fonipa"
  66. "50,und,fr-CA-fonupa/fr-BE-fonipa": true, // combined: got "fr-CA-fonupa"; want "fr-BE-fonipa"
  67. // The initial number is a threshold. As we don't use scoring, we will not
  68. // implement this.
  69. "50,und,fr-Cyrl-CA-fonupa/fr-BE-fonipa": true,
  70. // match: got "und"; want "fr-Cyrl-CA-fonupa"
  71. // combined: got "und"; want "fr-Cyrl-BE-fonipa"
  72. // Other interesting cases to test:
  73. // - Should same language or same script have the preference if there is
  74. // usually no understanding of the other script?
  75. // - More specific region in desired may replace enclosing supported.
  76. }
  77. func makeTagList(s string) (tags []Tag) {
  78. for _, s := range strings.Split(s, ",") {
  79. tags = append(tags, mk(strings.TrimSpace(s)))
  80. }
  81. return tags
  82. }
  83. func TestMatchStrings(t *testing.T) {
  84. testCases := []struct {
  85. supported string
  86. desired string // strings separted by |
  87. tag string
  88. index int
  89. }{{
  90. supported: "en",
  91. desired: "",
  92. tag: "en",
  93. index: 0,
  94. }, {
  95. supported: "en",
  96. desired: "nl",
  97. tag: "en",
  98. index: 0,
  99. }, {
  100. supported: "en,nl",
  101. desired: "nl",
  102. tag: "nl",
  103. index: 1,
  104. }, {
  105. supported: "en,nl",
  106. desired: "nl|en",
  107. tag: "nl",
  108. index: 1,
  109. }, {
  110. supported: "en-GB,nl",
  111. desired: "en ; q=0.1,nl",
  112. tag: "nl",
  113. index: 1,
  114. }, {
  115. supported: "en-GB,nl",
  116. desired: "en;q=0.005 | dk; q=0.1,nl ",
  117. tag: "en-GB",
  118. index: 0,
  119. }, {
  120. // do not match faulty tags with und
  121. supported: "en,und",
  122. desired: "|en",
  123. tag: "en",
  124. index: 0,
  125. }}
  126. for _, tc := range testCases {
  127. t.Run(path.Join(tc.supported, tc.desired), func(t *testing.T) {
  128. m := NewMatcher(makeTagList(tc.supported))
  129. tag, index := MatchStrings(m, strings.Split(tc.desired, "|")...)
  130. if tag.String() != tc.tag || index != tc.index {
  131. t.Errorf("got %v, %d; want %v, %d", tag, index, tc.tag, tc.index)
  132. }
  133. })
  134. }
  135. }
  136. func TestAddLikelySubtags(t *testing.T) {
  137. tests := []struct{ in, out string }{
  138. {"aa", "aa-Latn-ET"},
  139. {"aa-Latn", "aa-Latn-ET"},
  140. {"aa-Arab", "aa-Arab-ET"},
  141. {"aa-Arab-ER", "aa-Arab-ER"},
  142. {"kk", "kk-Cyrl-KZ"},
  143. {"kk-CN", "kk-Arab-CN"},
  144. {"cmn", "cmn"},
  145. {"zh-AU", "zh-Hant-AU"},
  146. {"zh-VN", "zh-Hant-VN"},
  147. {"zh-SG", "zh-Hans-SG"},
  148. {"zh-Hant", "zh-Hant-TW"},
  149. {"zh-Hani", "zh-Hani-CN"},
  150. {"und-Hani", "zh-Hani-CN"},
  151. {"und", "en-Latn-US"},
  152. {"und-GB", "en-Latn-GB"},
  153. {"und-CW", "pap-Latn-CW"},
  154. {"und-YT", "fr-Latn-YT"},
  155. {"und-Arab", "ar-Arab-EG"},
  156. {"und-AM", "hy-Armn-AM"},
  157. {"und-TW", "zh-Hant-TW"},
  158. {"und-002", "en-Latn-NG"},
  159. {"und-Latn-002", "en-Latn-NG"},
  160. {"en-Latn-002", "en-Latn-NG"},
  161. {"en-002", "en-Latn-NG"},
  162. {"en-001", "en-Latn-US"},
  163. {"und-003", "en-Latn-US"},
  164. {"und-GB", "en-Latn-GB"},
  165. {"Latn-001", "en-Latn-US"},
  166. {"en-001", "en-Latn-US"},
  167. {"es-419", "es-Latn-419"},
  168. {"he-145", "he-Hebr-IL"},
  169. {"ky-145", "ky-Latn-TR"},
  170. {"kk", "kk-Cyrl-KZ"},
  171. // Don't specialize duplicate and ambiguous matches.
  172. {"kk-034", "kk-Arab-034"}, // Matches IR and AF. Both are Arab.
  173. {"ku-145", "ku-Latn-TR"}, // Matches IQ, TR, and LB, but kk -> TR.
  174. {"und-Arab-CC", "ms-Arab-CC"},
  175. {"und-Arab-GB", "ks-Arab-GB"},
  176. {"und-Hans-CC", "zh-Hans-CC"},
  177. {"und-CC", "en-Latn-CC"},
  178. {"sr", "sr-Cyrl-RS"},
  179. {"sr-151", "sr-Latn-151"}, // Matches RO and RU.
  180. // We would like addLikelySubtags to generate the same results if the input
  181. // only changes by adding tags that would otherwise have been added
  182. // by the expansion.
  183. // In other words:
  184. // und-AA -> xx-Scrp-AA implies und-Scrp-AA -> xx-Scrp-AA
  185. // und-AA -> xx-Scrp-AA implies xx-AA -> xx-Scrp-AA
  186. // und-Scrp -> xx-Scrp-AA implies und-Scrp-AA -> xx-Scrp-AA
  187. // und-Scrp -> xx-Scrp-AA implies xx-Scrp -> xx-Scrp-AA
  188. // xx -> xx-Scrp-AA implies xx-Scrp -> xx-Scrp-AA
  189. // xx -> xx-Scrp-AA implies xx-AA -> xx-Scrp-AA
  190. //
  191. // The algorithm specified in
  192. // http://unicode.org/reports/tr35/tr35-9.html#Supplemental_Data,
  193. // Section C.10, does not handle the first case. For example,
  194. // the CLDR data contains an entry und-BJ -> fr-Latn-BJ, but not
  195. // there is no rule for und-Latn-BJ. According to spec, und-Latn-BJ
  196. // would expand to en-Latn-BJ, violating the aforementioned principle.
  197. // We deviate from the spec by letting und-Scrp-AA expand to xx-Scrp-AA
  198. // if a rule of the form und-AA -> xx-Scrp-AA is defined.
  199. // Note that as of version 23, CLDR has some explicitly specified
  200. // entries that do not conform to these rules. The implementation
  201. // will not correct these explicit inconsistencies. A later versions of CLDR
  202. // is supposed to fix this.
  203. {"und-Latn-BJ", "fr-Latn-BJ"},
  204. {"und-Bugi-ID", "bug-Bugi-ID"},
  205. // regions, scripts and languages without definitions
  206. {"und-Arab-AA", "ar-Arab-AA"},
  207. {"und-Afak-RE", "fr-Afak-RE"},
  208. {"und-Arab-GB", "ks-Arab-GB"},
  209. {"abp-Arab-GB", "abp-Arab-GB"},
  210. // script has preference over region
  211. {"und-Arab-NL", "ar-Arab-NL"},
  212. {"zza", "zza-Latn-TR"},
  213. // preserve variants and extensions
  214. {"de-1901", "de-Latn-DE-1901"},
  215. {"de-x-abc", "de-Latn-DE-x-abc"},
  216. {"de-1901-x-abc", "de-Latn-DE-1901-x-abc"},
  217. {"x-abc", "x-abc"}, // TODO: is this the desired behavior?
  218. }
  219. for i, tt := range tests {
  220. in, _ := Parse(tt.in)
  221. out, _ := Parse(tt.out)
  222. in, _ = in.addLikelySubtags()
  223. if in.String() != out.String() {
  224. t.Errorf("%d: add(%s) was %s; want %s", i, tt.in, in, tt.out)
  225. }
  226. }
  227. }
  228. func TestMinimize(t *testing.T) {
  229. tests := []struct{ in, out string }{
  230. {"aa", "aa"},
  231. {"aa-Latn", "aa"},
  232. {"aa-Latn-ET", "aa"},
  233. {"aa-ET", "aa"},
  234. {"aa-Arab", "aa-Arab"},
  235. {"aa-Arab-ER", "aa-Arab-ER"},
  236. {"aa-Arab-ET", "aa-Arab"},
  237. {"und", "und"},
  238. {"und-Latn", "und"},
  239. {"und-Latn-US", "und"},
  240. {"en-Latn-US", "en"},
  241. {"cmn", "cmn"},
  242. {"cmn-Hans", "cmn-Hans"},
  243. {"cmn-Hant", "cmn-Hant"},
  244. {"zh-AU", "zh-AU"},
  245. {"zh-VN", "zh-VN"},
  246. {"zh-SG", "zh-SG"},
  247. {"zh-Hant", "zh-Hant"},
  248. {"zh-Hant-TW", "zh-TW"},
  249. {"zh-Hans", "zh"},
  250. {"zh-Hani", "zh-Hani"},
  251. {"und-Hans", "und-Hans"},
  252. {"und-Hani", "und-Hani"},
  253. {"und-CW", "und-CW"},
  254. {"und-YT", "und-YT"},
  255. {"und-Arab", "und-Arab"},
  256. {"und-AM", "und-AM"},
  257. {"und-Arab-CC", "und-Arab-CC"},
  258. {"und-CC", "und-CC"},
  259. {"und-Latn-BJ", "und-BJ"},
  260. {"und-Bugi-ID", "und-Bugi"},
  261. {"bug-Bugi-ID", "bug-Bugi"},
  262. // regions, scripts and languages without definitions
  263. {"und-Arab-AA", "und-Arab-AA"},
  264. // preserve variants and extensions
  265. {"de-Latn-1901", "de-1901"},
  266. {"de-Latn-x-abc", "de-x-abc"},
  267. {"de-DE-1901-x-abc", "de-1901-x-abc"},
  268. {"x-abc", "x-abc"}, // TODO: is this the desired behavior?
  269. }
  270. for i, tt := range tests {
  271. in, _ := Parse(tt.in)
  272. out, _ := Parse(tt.out)
  273. min, _ := in.minimize()
  274. if min.String() != out.String() {
  275. t.Errorf("%d: min(%s) was %s; want %s", i, tt.in, min, tt.out)
  276. }
  277. max, _ := min.addLikelySubtags()
  278. if x, _ := in.addLikelySubtags(); x.String() != max.String() {
  279. t.Errorf("%d: max(min(%s)) = %s; want %s", i, tt.in, max, x)
  280. }
  281. }
  282. }
  283. func TestRegionGroups(t *testing.T) {
  284. testCases := []struct {
  285. a, b string
  286. distance uint8
  287. }{
  288. {"zh-TW", "zh-HK", 5},
  289. {"zh-MO", "zh-HK", 4},
  290. {"es-ES", "es-AR", 5},
  291. {"es-ES", "es", 4},
  292. {"es-419", "es-MX", 4},
  293. {"es-AR", "es-MX", 4},
  294. {"es-ES", "es-MX", 5},
  295. {"es-PT", "es-MX", 5},
  296. }
  297. for _, tc := range testCases {
  298. a := MustParse(tc.a)
  299. aScript, _ := a.Script()
  300. b := MustParse(tc.b)
  301. bScript, _ := b.Script()
  302. if aScript != bScript {
  303. t.Errorf("scripts differ: %q vs %q", aScript, bScript)
  304. continue
  305. }
  306. d, _ := regionGroupDist(a.region, b.region, aScript.scriptID, a.lang)
  307. if d != tc.distance {
  308. t.Errorf("got %q; want %q", d, tc.distance)
  309. }
  310. }
  311. }
  312. func TestIsParadigmLocale(t *testing.T) {
  313. testCases := map[string]bool{
  314. "en-US": true,
  315. "en-GB": true,
  316. "en-VI": false,
  317. "es-GB": false,
  318. "es-ES": true,
  319. "es-419": true,
  320. }
  321. for str, want := range testCases {
  322. tag := Make(str)
  323. got := isParadigmLocale(tag.lang, tag.region)
  324. if got != want {
  325. t.Errorf("isPL(%q) = %v; want %v", str, got, want)
  326. }
  327. }
  328. }
  329. // Implementation of String methods for various types for debugging purposes.
  330. func (m *matcher) String() string {
  331. w := &bytes.Buffer{}
  332. fmt.Fprintln(w, "Default:", m.default_)
  333. for tag, h := range m.index {
  334. fmt.Fprintf(w, " %s: %v\n", tag, h)
  335. }
  336. return w.String()
  337. }
  338. func (h *matchHeader) String() string {
  339. w := &bytes.Buffer{}
  340. fmt.Fprint(w, "haveTag: ")
  341. for _, h := range h.haveTags {
  342. fmt.Fprintf(w, "%v, ", h)
  343. }
  344. return w.String()
  345. }
  346. func (t haveTag) String() string {
  347. return fmt.Sprintf("%v:%d:%v:%v-%v|%v", t.tag, t.index, t.conf, t.maxRegion, t.maxScript, t.altScript)
  348. }
  349. func TestBestMatchAlloc(t *testing.T) {
  350. m := NewMatcher(makeTagList("en sr nl"))
  351. // Go allocates when creating a list of tags from a single tag!
  352. list := []Tag{English}
  353. avg := testtext.AllocsPerRun(1, func() {
  354. m.Match(list...)
  355. })
  356. if avg > 0 {
  357. t.Errorf("got %f; want 0", avg)
  358. }
  359. }
  360. var benchHave = []Tag{
  361. mk("en"),
  362. mk("en-GB"),
  363. mk("za"),
  364. mk("zh-Hant"),
  365. mk("zh-Hans-CN"),
  366. mk("zh"),
  367. mk("zh-HK"),
  368. mk("ar-MK"),
  369. mk("en-CA"),
  370. mk("fr-CA"),
  371. mk("fr-US"),
  372. mk("fr-CH"),
  373. mk("fr"),
  374. mk("lt"),
  375. mk("lv"),
  376. mk("iw"),
  377. mk("iw-NL"),
  378. mk("he"),
  379. mk("he-IT"),
  380. mk("tlh"),
  381. mk("ja"),
  382. mk("ja-Jpan"),
  383. mk("ja-Jpan-JP"),
  384. mk("de"),
  385. mk("de-CH"),
  386. mk("de-AT"),
  387. mk("de-DE"),
  388. mk("sr"),
  389. mk("sr-Latn"),
  390. mk("sr-Cyrl"),
  391. mk("sr-ME"),
  392. }
  393. var benchWant = [][]Tag{
  394. []Tag{
  395. mk("en"),
  396. },
  397. []Tag{
  398. mk("en-AU"),
  399. mk("de-HK"),
  400. mk("nl"),
  401. mk("fy"),
  402. mk("lv"),
  403. },
  404. []Tag{
  405. mk("en-AU"),
  406. mk("de-HK"),
  407. mk("nl"),
  408. mk("fy"),
  409. },
  410. []Tag{
  411. mk("ja-Hant"),
  412. mk("da-HK"),
  413. mk("nl"),
  414. mk("zh-TW"),
  415. },
  416. []Tag{
  417. mk("ja-Hant"),
  418. mk("da-HK"),
  419. mk("nl"),
  420. mk("hr"),
  421. },
  422. }
  423. func BenchmarkMatch(b *testing.B) {
  424. m := newMatcher(benchHave, nil)
  425. for i := 0; i < b.N; i++ {
  426. for _, want := range benchWant {
  427. m.getBest(want...)
  428. }
  429. }
  430. }
  431. func BenchmarkMatchExact(b *testing.B) {
  432. want := mk("en")
  433. m := newMatcher(benchHave, nil)
  434. for i := 0; i < b.N; i++ {
  435. m.getBest(want)
  436. }
  437. }
  438. func BenchmarkMatchAltLanguagePresent(b *testing.B) {
  439. want := mk("hr")
  440. m := newMatcher(benchHave, nil)
  441. for i := 0; i < b.N; i++ {
  442. m.getBest(want)
  443. }
  444. }
  445. func BenchmarkMatchAltLanguageNotPresent(b *testing.B) {
  446. want := mk("nn")
  447. m := newMatcher(benchHave, nil)
  448. for i := 0; i < b.N; i++ {
  449. m.getBest(want)
  450. }
  451. }
  452. func BenchmarkMatchAltScriptPresent(b *testing.B) {
  453. want := mk("zh-Hant-CN")
  454. m := newMatcher(benchHave, nil)
  455. for i := 0; i < b.N; i++ {
  456. m.getBest(want)
  457. }
  458. }
  459. func BenchmarkMatchAltScriptNotPresent(b *testing.B) {
  460. want := mk("fr-Cyrl")
  461. m := newMatcher(benchHave, nil)
  462. for i := 0; i < b.N; i++ {
  463. m.getBest(want)
  464. }
  465. }
  466. func BenchmarkMatchLimitedExact(b *testing.B) {
  467. want := []Tag{mk("he-NL"), mk("iw-NL")}
  468. m := newMatcher(benchHave, nil)
  469. for i := 0; i < b.N; i++ {
  470. m.getBest(want...)
  471. }
  472. }