match.go 29 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933
  1. // Copyright 2013 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package language
  5. import "errors"
  6. // A MatchOption configures a Matcher.
  7. type MatchOption func(*matcher)
  8. // PreferSameScript will, in the absence of a match, result in the first
  9. // preferred tag with the same script as a supported tag to match this supported
  10. // tag. The default is currently true, but this may change in the future.
  11. func PreferSameScript(preferSame bool) MatchOption {
  12. return func(m *matcher) { m.preferSameScript = preferSame }
  13. }
  14. // TODO(v1.0.0): consider making Matcher a concrete type, instead of interface.
  15. // There doesn't seem to be too much need for multiple types.
  16. // Making it a concrete type allows MatchStrings to be a method, which will
  17. // improve its discoverability.
  18. // MatchStrings parses and matches the given strings until one of them matches
  19. // the language in the Matcher. A string may be an Accept-Language header as
  20. // handled by ParseAcceptLanguage. The default language is returned if no
  21. // other language matched.
  22. func MatchStrings(m Matcher, lang ...string) (tag Tag, index int) {
  23. for _, accept := range lang {
  24. desired, _, err := ParseAcceptLanguage(accept)
  25. if err != nil {
  26. continue
  27. }
  28. if tag, index, conf := m.Match(desired...); conf != No {
  29. return tag, index
  30. }
  31. }
  32. tag, index, _ = m.Match()
  33. return
  34. }
  35. // Matcher is the interface that wraps the Match method.
  36. //
  37. // Match returns the best match for any of the given tags, along with
  38. // a unique index associated with the returned tag and a confidence
  39. // score.
  40. type Matcher interface {
  41. Match(t ...Tag) (tag Tag, index int, c Confidence)
  42. }
  43. // Comprehends reports the confidence score for a speaker of a given language
  44. // to being able to comprehend the written form of an alternative language.
  45. func Comprehends(speaker, alternative Tag) Confidence {
  46. _, _, c := NewMatcher([]Tag{alternative}).Match(speaker)
  47. return c
  48. }
  49. // NewMatcher returns a Matcher that matches an ordered list of preferred tags
  50. // against a list of supported tags based on written intelligibility, closeness
  51. // of dialect, equivalence of subtags and various other rules. It is initialized
  52. // with the list of supported tags. The first element is used as the default
  53. // value in case no match is found.
  54. //
  55. // Its Match method matches the first of the given Tags to reach a certain
  56. // confidence threshold. The tags passed to Match should therefore be specified
  57. // in order of preference. Extensions are ignored for matching.
  58. //
  59. // The index returned by the Match method corresponds to the index of the
  60. // matched tag in t, but is augmented with the Unicode extension ('u')of the
  61. // corresponding preferred tag. This allows user locale options to be passed
  62. // transparently.
  63. func NewMatcher(t []Tag, options ...MatchOption) Matcher {
  64. return newMatcher(t, options)
  65. }
  66. func (m *matcher) Match(want ...Tag) (t Tag, index int, c Confidence) {
  67. match, w, c := m.getBest(want...)
  68. if match != nil {
  69. t, index = match.tag, match.index
  70. } else {
  71. // TODO: this should be an option
  72. t = m.default_.tag
  73. if m.preferSameScript {
  74. outer:
  75. for _, w := range want {
  76. script, _ := w.Script()
  77. if script.scriptID == 0 {
  78. // Don't do anything if there is no script, such as with
  79. // private subtags.
  80. continue
  81. }
  82. for i, h := range m.supported {
  83. if script.scriptID == h.maxScript {
  84. t, index = h.tag, i
  85. break outer
  86. }
  87. }
  88. }
  89. }
  90. // TODO: select first language tag based on script.
  91. }
  92. if w.region != 0 && t.region != 0 && t.region.contains(w.region) {
  93. t, _ = Raw.Compose(t, Region{w.region})
  94. }
  95. // Copy options from the user-provided tag into the result tag. This is hard
  96. // to do after the fact, so we do it here.
  97. // TODO: add in alternative variants to -u-va-.
  98. // TODO: add preferred region to -u-rg-.
  99. if e := w.Extensions(); len(e) > 0 {
  100. t, _ = Raw.Compose(t, e)
  101. }
  102. return t, index, c
  103. }
  104. type scriptRegionFlags uint8
  105. const (
  106. isList = 1 << iota
  107. scriptInFrom
  108. regionInFrom
  109. )
  110. func (t *Tag) setUndefinedLang(id langID) {
  111. if t.lang == 0 {
  112. t.lang = id
  113. }
  114. }
  115. func (t *Tag) setUndefinedScript(id scriptID) {
  116. if t.script == 0 {
  117. t.script = id
  118. }
  119. }
  120. func (t *Tag) setUndefinedRegion(id regionID) {
  121. if t.region == 0 || t.region.contains(id) {
  122. t.region = id
  123. }
  124. }
  125. // ErrMissingLikelyTagsData indicates no information was available
  126. // to compute likely values of missing tags.
  127. var ErrMissingLikelyTagsData = errors.New("missing likely tags data")
  128. // addLikelySubtags sets subtags to their most likely value, given the locale.
  129. // In most cases this means setting fields for unknown values, but in some
  130. // cases it may alter a value. It returns an ErrMissingLikelyTagsData error
  131. // if the given locale cannot be expanded.
  132. func (t Tag) addLikelySubtags() (Tag, error) {
  133. id, err := addTags(t)
  134. if err != nil {
  135. return t, err
  136. } else if id.equalTags(t) {
  137. return t, nil
  138. }
  139. id.remakeString()
  140. return id, nil
  141. }
  142. // specializeRegion attempts to specialize a group region.
  143. func specializeRegion(t *Tag) bool {
  144. if i := regionInclusion[t.region]; i < nRegionGroups {
  145. x := likelyRegionGroup[i]
  146. if langID(x.lang) == t.lang && scriptID(x.script) == t.script {
  147. t.region = regionID(x.region)
  148. }
  149. return true
  150. }
  151. return false
  152. }
  153. func addTags(t Tag) (Tag, error) {
  154. // We leave private use identifiers alone.
  155. if t.private() {
  156. return t, nil
  157. }
  158. if t.script != 0 && t.region != 0 {
  159. if t.lang != 0 {
  160. // already fully specified
  161. specializeRegion(&t)
  162. return t, nil
  163. }
  164. // Search matches for und-script-region. Note that for these cases
  165. // region will never be a group so there is no need to check for this.
  166. list := likelyRegion[t.region : t.region+1]
  167. if x := list[0]; x.flags&isList != 0 {
  168. list = likelyRegionList[x.lang : x.lang+uint16(x.script)]
  169. }
  170. for _, x := range list {
  171. // Deviating from the spec. See match_test.go for details.
  172. if scriptID(x.script) == t.script {
  173. t.setUndefinedLang(langID(x.lang))
  174. return t, nil
  175. }
  176. }
  177. }
  178. if t.lang != 0 {
  179. // Search matches for lang-script and lang-region, where lang != und.
  180. if t.lang < langNoIndexOffset {
  181. x := likelyLang[t.lang]
  182. if x.flags&isList != 0 {
  183. list := likelyLangList[x.region : x.region+uint16(x.script)]
  184. if t.script != 0 {
  185. for _, x := range list {
  186. if scriptID(x.script) == t.script && x.flags&scriptInFrom != 0 {
  187. t.setUndefinedRegion(regionID(x.region))
  188. return t, nil
  189. }
  190. }
  191. } else if t.region != 0 {
  192. count := 0
  193. goodScript := true
  194. tt := t
  195. for _, x := range list {
  196. // We visit all entries for which the script was not
  197. // defined, including the ones where the region was not
  198. // defined. This allows for proper disambiguation within
  199. // regions.
  200. if x.flags&scriptInFrom == 0 && t.region.contains(regionID(x.region)) {
  201. tt.region = regionID(x.region)
  202. tt.setUndefinedScript(scriptID(x.script))
  203. goodScript = goodScript && tt.script == scriptID(x.script)
  204. count++
  205. }
  206. }
  207. if count == 1 {
  208. return tt, nil
  209. }
  210. // Even if we fail to find a unique Region, we might have
  211. // an unambiguous script.
  212. if goodScript {
  213. t.script = tt.script
  214. }
  215. }
  216. }
  217. }
  218. } else {
  219. // Search matches for und-script.
  220. if t.script != 0 {
  221. x := likelyScript[t.script]
  222. if x.region != 0 {
  223. t.setUndefinedRegion(regionID(x.region))
  224. t.setUndefinedLang(langID(x.lang))
  225. return t, nil
  226. }
  227. }
  228. // Search matches for und-region. If und-script-region exists, it would
  229. // have been found earlier.
  230. if t.region != 0 {
  231. if i := regionInclusion[t.region]; i < nRegionGroups {
  232. x := likelyRegionGroup[i]
  233. if x.region != 0 {
  234. t.setUndefinedLang(langID(x.lang))
  235. t.setUndefinedScript(scriptID(x.script))
  236. t.region = regionID(x.region)
  237. }
  238. } else {
  239. x := likelyRegion[t.region]
  240. if x.flags&isList != 0 {
  241. x = likelyRegionList[x.lang]
  242. }
  243. if x.script != 0 && x.flags != scriptInFrom {
  244. t.setUndefinedLang(langID(x.lang))
  245. t.setUndefinedScript(scriptID(x.script))
  246. return t, nil
  247. }
  248. }
  249. }
  250. }
  251. // Search matches for lang.
  252. if t.lang < langNoIndexOffset {
  253. x := likelyLang[t.lang]
  254. if x.flags&isList != 0 {
  255. x = likelyLangList[x.region]
  256. }
  257. if x.region != 0 {
  258. t.setUndefinedScript(scriptID(x.script))
  259. t.setUndefinedRegion(regionID(x.region))
  260. }
  261. specializeRegion(&t)
  262. if t.lang == 0 {
  263. t.lang = _en // default language
  264. }
  265. return t, nil
  266. }
  267. return t, ErrMissingLikelyTagsData
  268. }
  269. func (t *Tag) setTagsFrom(id Tag) {
  270. t.lang = id.lang
  271. t.script = id.script
  272. t.region = id.region
  273. }
  274. // minimize removes the region or script subtags from t such that
  275. // t.addLikelySubtags() == t.minimize().addLikelySubtags().
  276. func (t Tag) minimize() (Tag, error) {
  277. t, err := minimizeTags(t)
  278. if err != nil {
  279. return t, err
  280. }
  281. t.remakeString()
  282. return t, nil
  283. }
  284. // minimizeTags mimics the behavior of the ICU 51 C implementation.
  285. func minimizeTags(t Tag) (Tag, error) {
  286. if t.equalTags(und) {
  287. return t, nil
  288. }
  289. max, err := addTags(t)
  290. if err != nil {
  291. return t, err
  292. }
  293. for _, id := range [...]Tag{
  294. {lang: t.lang},
  295. {lang: t.lang, region: t.region},
  296. {lang: t.lang, script: t.script},
  297. } {
  298. if x, err := addTags(id); err == nil && max.equalTags(x) {
  299. t.setTagsFrom(id)
  300. break
  301. }
  302. }
  303. return t, nil
  304. }
  305. // Tag Matching
  306. // CLDR defines an algorithm for finding the best match between two sets of language
  307. // tags. The basic algorithm defines how to score a possible match and then find
  308. // the match with the best score
  309. // (see http://www.unicode.org/reports/tr35/#LanguageMatching).
  310. // Using scoring has several disadvantages. The scoring obfuscates the importance of
  311. // the various factors considered, making the algorithm harder to understand. Using
  312. // scoring also requires the full score to be computed for each pair of tags.
  313. //
  314. // We will use a different algorithm which aims to have the following properties:
  315. // - clarity on the precedence of the various selection factors, and
  316. // - improved performance by allowing early termination of a comparison.
  317. //
  318. // Matching algorithm (overview)
  319. // Input:
  320. // - supported: a set of supported tags
  321. // - default: the default tag to return in case there is no match
  322. // - desired: list of desired tags, ordered by preference, starting with
  323. // the most-preferred.
  324. //
  325. // Algorithm:
  326. // 1) Set the best match to the lowest confidence level
  327. // 2) For each tag in "desired":
  328. // a) For each tag in "supported":
  329. // 1) compute the match between the two tags.
  330. // 2) if the match is better than the previous best match, replace it
  331. // with the new match. (see next section)
  332. // b) if the current best match is Exact and pin is true the result will be
  333. // frozen to the language found thusfar, although better matches may
  334. // still be found for the same language.
  335. // 3) If the best match so far is below a certain threshold, return "default".
  336. //
  337. // Ranking:
  338. // We use two phases to determine whether one pair of tags are a better match
  339. // than another pair of tags. First, we determine a rough confidence level. If the
  340. // levels are different, the one with the highest confidence wins.
  341. // Second, if the rough confidence levels are identical, we use a set of tie-breaker
  342. // rules.
  343. //
  344. // The confidence level of matching a pair of tags is determined by finding the
  345. // lowest confidence level of any matches of the corresponding subtags (the
  346. // result is deemed as good as its weakest link).
  347. // We define the following levels:
  348. // Exact - An exact match of a subtag, before adding likely subtags.
  349. // MaxExact - An exact match of a subtag, after adding likely subtags.
  350. // [See Note 2].
  351. // High - High level of mutual intelligibility between different subtag
  352. // variants.
  353. // Low - Low level of mutual intelligibility between different subtag
  354. // variants.
  355. // No - No mutual intelligibility.
  356. //
  357. // The following levels can occur for each type of subtag:
  358. // Base: Exact, MaxExact, High, Low, No
  359. // Script: Exact, MaxExact [see Note 3], Low, No
  360. // Region: Exact, MaxExact, High
  361. // Variant: Exact, High
  362. // Private: Exact, No
  363. //
  364. // Any result with a confidence level of Low or higher is deemed a possible match.
  365. // Once a desired tag matches any of the supported tags with a level of MaxExact
  366. // or higher, the next desired tag is not considered (see Step 2.b).
  367. // Note that CLDR provides languageMatching data that defines close equivalence
  368. // classes for base languages, scripts and regions.
  369. //
  370. // Tie-breaking
  371. // If we get the same confidence level for two matches, we apply a sequence of
  372. // tie-breaking rules. The first that succeeds defines the result. The rules are
  373. // applied in the following order.
  374. // 1) Original language was defined and was identical.
  375. // 2) Original region was defined and was identical.
  376. // 3) Distance between two maximized regions was the smallest.
  377. // 4) Original script was defined and was identical.
  378. // 5) Distance from want tag to have tag using the parent relation [see Note 5.]
  379. // If there is still no winner after these rules are applied, the first match
  380. // found wins.
  381. //
  382. // Notes:
  383. // [2] In practice, as matching of Exact is done in a separate phase from
  384. // matching the other levels, we reuse the Exact level to mean MaxExact in
  385. // the second phase. As a consequence, we only need the levels defined by
  386. // the Confidence type. The MaxExact confidence level is mapped to High in
  387. // the public API.
  388. // [3] We do not differentiate between maximized script values that were derived
  389. // from suppressScript versus most likely tag data. We determined that in
  390. // ranking the two, one ranks just after the other. Moreover, the two cannot
  391. // occur concurrently. As a consequence, they are identical for practical
  392. // purposes.
  393. // [4] In case of deprecated, macro-equivalents and legacy mappings, we assign
  394. // the MaxExact level to allow iw vs he to still be a closer match than
  395. // en-AU vs en-US, for example.
  396. // [5] In CLDR a locale inherits fields that are unspecified for this locale
  397. // from its parent. Therefore, if a locale is a parent of another locale,
  398. // it is a strong measure for closeness, especially when no other tie
  399. // breaker rule applies. One could also argue it is inconsistent, for
  400. // example, when pt-AO matches pt (which CLDR equates with pt-BR), even
  401. // though its parent is pt-PT according to the inheritance rules.
  402. //
  403. // Implementation Details:
  404. // There are several performance considerations worth pointing out. Most notably,
  405. // we preprocess as much as possible (within reason) at the time of creation of a
  406. // matcher. This includes:
  407. // - creating a per-language map, which includes data for the raw base language
  408. // and its canonicalized variant (if applicable),
  409. // - expanding entries for the equivalence classes defined in CLDR's
  410. // languageMatch data.
  411. // The per-language map ensures that typically only a very small number of tags
  412. // need to be considered. The pre-expansion of canonicalized subtags and
  413. // equivalence classes reduces the amount of map lookups that need to be done at
  414. // runtime.
  415. // matcher keeps a set of supported language tags, indexed by language.
  416. type matcher struct {
  417. default_ *haveTag
  418. supported []*haveTag
  419. index map[langID]*matchHeader
  420. passSettings bool
  421. preferSameScript bool
  422. }
  423. // matchHeader has the lists of tags for exact matches and matches based on
  424. // maximized and canonicalized tags for a given language.
  425. type matchHeader struct {
  426. haveTags []*haveTag
  427. original bool
  428. }
  429. // haveTag holds a supported Tag and its maximized script and region. The maximized
  430. // or canonicalized language is not stored as it is not needed during matching.
  431. type haveTag struct {
  432. tag Tag
  433. // index of this tag in the original list of supported tags.
  434. index int
  435. // conf is the maximum confidence that can result from matching this haveTag.
  436. // When conf < Exact this means it was inserted after applying a CLDR equivalence rule.
  437. conf Confidence
  438. // Maximized region and script.
  439. maxRegion regionID
  440. maxScript scriptID
  441. // altScript may be checked as an alternative match to maxScript. If altScript
  442. // matches, the confidence level for this match is Low. Theoretically there
  443. // could be multiple alternative scripts. This does not occur in practice.
  444. altScript scriptID
  445. // nextMax is the index of the next haveTag with the same maximized tags.
  446. nextMax uint16
  447. }
  448. func makeHaveTag(tag Tag, index int) (haveTag, langID) {
  449. max := tag
  450. if tag.lang != 0 || tag.region != 0 || tag.script != 0 {
  451. max, _ = max.canonicalize(All)
  452. max, _ = addTags(max)
  453. max.remakeString()
  454. }
  455. return haveTag{tag, index, Exact, max.region, max.script, altScript(max.lang, max.script), 0}, max.lang
  456. }
  457. // altScript returns an alternative script that may match the given script with
  458. // a low confidence. At the moment, the langMatch data allows for at most one
  459. // script to map to another and we rely on this to keep the code simple.
  460. func altScript(l langID, s scriptID) scriptID {
  461. for _, alt := range matchScript {
  462. // TODO: also match cases where language is not the same.
  463. if (langID(alt.wantLang) == l || langID(alt.haveLang) == l) &&
  464. scriptID(alt.haveScript) == s {
  465. return scriptID(alt.wantScript)
  466. }
  467. }
  468. return 0
  469. }
  470. // addIfNew adds a haveTag to the list of tags only if it is a unique tag.
  471. // Tags that have the same maximized values are linked by index.
  472. func (h *matchHeader) addIfNew(n haveTag, exact bool) {
  473. h.original = h.original || exact
  474. // Don't add new exact matches.
  475. for _, v := range h.haveTags {
  476. if v.tag.equalsRest(n.tag) {
  477. return
  478. }
  479. }
  480. // Allow duplicate maximized tags, but create a linked list to allow quickly
  481. // comparing the equivalents and bail out.
  482. for i, v := range h.haveTags {
  483. if v.maxScript == n.maxScript &&
  484. v.maxRegion == n.maxRegion &&
  485. v.tag.variantOrPrivateTagStr() == n.tag.variantOrPrivateTagStr() {
  486. for h.haveTags[i].nextMax != 0 {
  487. i = int(h.haveTags[i].nextMax)
  488. }
  489. h.haveTags[i].nextMax = uint16(len(h.haveTags))
  490. break
  491. }
  492. }
  493. h.haveTags = append(h.haveTags, &n)
  494. }
  495. // header returns the matchHeader for the given language. It creates one if
  496. // it doesn't already exist.
  497. func (m *matcher) header(l langID) *matchHeader {
  498. if h := m.index[l]; h != nil {
  499. return h
  500. }
  501. h := &matchHeader{}
  502. m.index[l] = h
  503. return h
  504. }
  505. func toConf(d uint8) Confidence {
  506. if d <= 10 {
  507. return High
  508. }
  509. if d < 30 {
  510. return Low
  511. }
  512. return No
  513. }
  514. // newMatcher builds an index for the given supported tags and returns it as
  515. // a matcher. It also expands the index by considering various equivalence classes
  516. // for a given tag.
  517. func newMatcher(supported []Tag, options []MatchOption) *matcher {
  518. m := &matcher{
  519. index: make(map[langID]*matchHeader),
  520. preferSameScript: true,
  521. }
  522. for _, o := range options {
  523. o(m)
  524. }
  525. if len(supported) == 0 {
  526. m.default_ = &haveTag{}
  527. return m
  528. }
  529. // Add supported languages to the index. Add exact matches first to give
  530. // them precedence.
  531. for i, tag := range supported {
  532. pair, _ := makeHaveTag(tag, i)
  533. m.header(tag.lang).addIfNew(pair, true)
  534. m.supported = append(m.supported, &pair)
  535. }
  536. m.default_ = m.header(supported[0].lang).haveTags[0]
  537. // Keep these in two different loops to support the case that two equivalent
  538. // languages are distinguished, such as iw and he.
  539. for i, tag := range supported {
  540. pair, max := makeHaveTag(tag, i)
  541. if max != tag.lang {
  542. m.header(max).addIfNew(pair, true)
  543. }
  544. }
  545. // update is used to add indexes in the map for equivalent languages.
  546. // update will only add entries to original indexes, thus not computing any
  547. // transitive relations.
  548. update := func(want, have uint16, conf Confidence) {
  549. if hh := m.index[langID(have)]; hh != nil {
  550. if !hh.original {
  551. return
  552. }
  553. hw := m.header(langID(want))
  554. for _, ht := range hh.haveTags {
  555. v := *ht
  556. if conf < v.conf {
  557. v.conf = conf
  558. }
  559. v.nextMax = 0 // this value needs to be recomputed
  560. if v.altScript != 0 {
  561. v.altScript = altScript(langID(want), v.maxScript)
  562. }
  563. hw.addIfNew(v, conf == Exact && hh.original)
  564. }
  565. }
  566. }
  567. // Add entries for languages with mutual intelligibility as defined by CLDR's
  568. // languageMatch data.
  569. for _, ml := range matchLang {
  570. update(ml.want, ml.have, toConf(ml.distance))
  571. if !ml.oneway {
  572. update(ml.have, ml.want, toConf(ml.distance))
  573. }
  574. }
  575. // Add entries for possible canonicalizations. This is an optimization to
  576. // ensure that only one map lookup needs to be done at runtime per desired tag.
  577. // First we match deprecated equivalents. If they are perfect equivalents
  578. // (their canonicalization simply substitutes a different language code, but
  579. // nothing else), the match confidence is Exact, otherwise it is High.
  580. for i, lm := range langAliasMap {
  581. // If deprecated codes match and there is no fiddling with the script or
  582. // or region, we consider it an exact match.
  583. conf := Exact
  584. if langAliasTypes[i] != langMacro {
  585. if !isExactEquivalent(langID(lm.from)) {
  586. conf = High
  587. }
  588. update(lm.to, lm.from, conf)
  589. }
  590. update(lm.from, lm.to, conf)
  591. }
  592. return m
  593. }
  594. // getBest gets the best matching tag in m for any of the given tags, taking into
  595. // account the order of preference of the given tags.
  596. func (m *matcher) getBest(want ...Tag) (got *haveTag, orig Tag, c Confidence) {
  597. best := bestMatch{}
  598. for i, w := range want {
  599. var max Tag
  600. // Check for exact match first.
  601. h := m.index[w.lang]
  602. if w.lang != 0 {
  603. if h == nil {
  604. continue
  605. }
  606. // Base language is defined.
  607. max, _ = w.canonicalize(Legacy | Deprecated | Macro)
  608. // A region that is added through canonicalization is stronger than
  609. // a maximized region: set it in the original (e.g. mo -> ro-MD).
  610. if w.region != max.region {
  611. w.region = max.region
  612. }
  613. // TODO: should we do the same for scripts?
  614. // See test case: en, sr, nl ; sh ; sr
  615. max, _ = addTags(max)
  616. } else {
  617. // Base language is not defined.
  618. if h != nil {
  619. for i := range h.haveTags {
  620. have := h.haveTags[i]
  621. if have.tag.equalsRest(w) {
  622. return have, w, Exact
  623. }
  624. }
  625. }
  626. if w.script == 0 && w.region == 0 {
  627. // We skip all tags matching und for approximate matching, including
  628. // private tags.
  629. continue
  630. }
  631. max, _ = addTags(w)
  632. if h = m.index[max.lang]; h == nil {
  633. continue
  634. }
  635. }
  636. pin := true
  637. for _, t := range want[i+1:] {
  638. if w.lang == t.lang {
  639. pin = false
  640. break
  641. }
  642. }
  643. // Check for match based on maximized tag.
  644. for i := range h.haveTags {
  645. have := h.haveTags[i]
  646. best.update(have, w, max.script, max.region, pin)
  647. if best.conf == Exact {
  648. for have.nextMax != 0 {
  649. have = h.haveTags[have.nextMax]
  650. best.update(have, w, max.script, max.region, pin)
  651. }
  652. return best.have, best.want, best.conf
  653. }
  654. }
  655. }
  656. if best.conf <= No {
  657. if len(want) != 0 {
  658. return nil, want[0], No
  659. }
  660. return nil, Tag{}, No
  661. }
  662. return best.have, best.want, best.conf
  663. }
  664. // bestMatch accumulates the best match so far.
  665. type bestMatch struct {
  666. have *haveTag
  667. want Tag
  668. conf Confidence
  669. pinnedRegion regionID
  670. pinLanguage bool
  671. sameRegionGroup bool
  672. // Cached results from applying tie-breaking rules.
  673. origLang bool
  674. origReg bool
  675. paradigmReg bool
  676. regGroupDist uint8
  677. origScript bool
  678. }
  679. // update updates the existing best match if the new pair is considered to be a
  680. // better match. To determine if the given pair is a better match, it first
  681. // computes the rough confidence level. If this surpasses the current match, it
  682. // will replace it and update the tie-breaker rule cache. If there is a tie, it
  683. // proceeds with applying a series of tie-breaker rules. If there is no
  684. // conclusive winner after applying the tie-breaker rules, it leaves the current
  685. // match as the preferred match.
  686. //
  687. // If pin is true and have and tag are a strong match, it will henceforth only
  688. // consider matches for this language. This corresponds to the nothing that most
  689. // users have a strong preference for the first defined language. A user can
  690. // still prefer a second language over a dialect of the preferred language by
  691. // explicitly specifying dialects, e.g. "en, nl, en-GB". In this case pin should
  692. // be false.
  693. func (m *bestMatch) update(have *haveTag, tag Tag, maxScript scriptID, maxRegion regionID, pin bool) {
  694. // Bail if the maximum attainable confidence is below that of the current best match.
  695. c := have.conf
  696. if c < m.conf {
  697. return
  698. }
  699. // Don't change the language once we already have found an exact match.
  700. if m.pinLanguage && tag.lang != m.want.lang {
  701. return
  702. }
  703. // Pin the region group if we are comparing tags for the same language.
  704. if tag.lang == m.want.lang && m.sameRegionGroup {
  705. _, sameGroup := regionGroupDist(m.pinnedRegion, have.maxRegion, have.maxScript, m.want.lang)
  706. if !sameGroup {
  707. return
  708. }
  709. }
  710. if c == Exact && have.maxScript == maxScript {
  711. // If there is another language and then another entry of this language,
  712. // don't pin anything, otherwise pin the language.
  713. m.pinLanguage = pin
  714. }
  715. if have.tag.equalsRest(tag) {
  716. } else if have.maxScript != maxScript {
  717. // There is usually very little comprehension between different scripts.
  718. // In a few cases there may still be Low comprehension. This possibility
  719. // is pre-computed and stored in have.altScript.
  720. if Low < m.conf || have.altScript != maxScript {
  721. return
  722. }
  723. c = Low
  724. } else if have.maxRegion != maxRegion {
  725. if High < c {
  726. // There is usually a small difference between languages across regions.
  727. c = High
  728. }
  729. }
  730. // We store the results of the computations of the tie-breaker rules along
  731. // with the best match. There is no need to do the checks once we determine
  732. // we have a winner, but we do still need to do the tie-breaker computations.
  733. // We use "beaten" to keep track if we still need to do the checks.
  734. beaten := false // true if the new pair defeats the current one.
  735. if c != m.conf {
  736. if c < m.conf {
  737. return
  738. }
  739. beaten = true
  740. }
  741. // Tie-breaker rules:
  742. // We prefer if the pre-maximized language was specified and identical.
  743. origLang := have.tag.lang == tag.lang && tag.lang != 0
  744. if !beaten && m.origLang != origLang {
  745. if m.origLang {
  746. return
  747. }
  748. beaten = true
  749. }
  750. // We prefer if the pre-maximized region was specified and identical.
  751. origReg := have.tag.region == tag.region && tag.region != 0
  752. if !beaten && m.origReg != origReg {
  753. if m.origReg {
  754. return
  755. }
  756. beaten = true
  757. }
  758. regGroupDist, sameGroup := regionGroupDist(have.maxRegion, maxRegion, maxScript, tag.lang)
  759. if !beaten && m.regGroupDist != regGroupDist {
  760. if regGroupDist > m.regGroupDist {
  761. return
  762. }
  763. beaten = true
  764. }
  765. paradigmReg := isParadigmLocale(tag.lang, have.maxRegion)
  766. if !beaten && m.paradigmReg != paradigmReg {
  767. if !paradigmReg {
  768. return
  769. }
  770. beaten = true
  771. }
  772. // Next we prefer if the pre-maximized script was specified and identical.
  773. origScript := have.tag.script == tag.script && tag.script != 0
  774. if !beaten && m.origScript != origScript {
  775. if m.origScript {
  776. return
  777. }
  778. beaten = true
  779. }
  780. // Update m to the newly found best match.
  781. if beaten {
  782. m.have = have
  783. m.want = tag
  784. m.conf = c
  785. m.pinnedRegion = maxRegion
  786. m.sameRegionGroup = sameGroup
  787. m.origLang = origLang
  788. m.origReg = origReg
  789. m.paradigmReg = paradigmReg
  790. m.origScript = origScript
  791. m.regGroupDist = regGroupDist
  792. }
  793. }
  794. func isParadigmLocale(lang langID, r regionID) bool {
  795. for _, e := range paradigmLocales {
  796. if langID(e[0]) == lang && (r == regionID(e[1]) || r == regionID(e[2])) {
  797. return true
  798. }
  799. }
  800. return false
  801. }
  802. // regionGroupDist computes the distance between two regions based on their
  803. // CLDR grouping.
  804. func regionGroupDist(a, b regionID, script scriptID, lang langID) (dist uint8, same bool) {
  805. const defaultDistance = 4
  806. aGroup := uint(regionToGroups[a]) << 1
  807. bGroup := uint(regionToGroups[b]) << 1
  808. for _, ri := range matchRegion {
  809. if langID(ri.lang) == lang && (ri.script == 0 || scriptID(ri.script) == script) {
  810. group := uint(1 << (ri.group &^ 0x80))
  811. if 0x80&ri.group == 0 {
  812. if aGroup&bGroup&group != 0 { // Both regions are in the group.
  813. return ri.distance, ri.distance == defaultDistance
  814. }
  815. } else {
  816. if (aGroup|bGroup)&group == 0 { // Both regions are not in the group.
  817. return ri.distance, ri.distance == defaultDistance
  818. }
  819. }
  820. }
  821. }
  822. return defaultDistance, true
  823. }
  824. func (t Tag) variants() string {
  825. if t.pVariant == 0 {
  826. return ""
  827. }
  828. return t.str[t.pVariant:t.pExt]
  829. }
  830. // variantOrPrivateTagStr returns variants or private use tags.
  831. func (t Tag) variantOrPrivateTagStr() string {
  832. if t.pExt > 0 {
  833. return t.str[t.pVariant:t.pExt]
  834. }
  835. return t.str[t.pVariant:]
  836. }
  837. // equalsRest compares everything except the language.
  838. func (a Tag) equalsRest(b Tag) bool {
  839. // TODO: don't include extensions in this comparison. To do this efficiently,
  840. // though, we should handle private tags separately.
  841. return a.script == b.script && a.region == b.region && a.variantOrPrivateTagStr() == b.variantOrPrivateTagStr()
  842. }
  843. // isExactEquivalent returns true if canonicalizing the language will not alter
  844. // the script or region of a tag.
  845. func isExactEquivalent(l langID) bool {
  846. for _, o := range notEquivalent {
  847. if o == l {
  848. return false
  849. }
  850. }
  851. return true
  852. }
  853. var notEquivalent []langID
  854. func init() {
  855. // Create a list of all languages for which canonicalization may alter the
  856. // script or region.
  857. for _, lm := range langAliasMap {
  858. tag := Tag{lang: langID(lm.from)}
  859. if tag, _ = tag.canonicalize(All); tag.script != 0 || tag.region != 0 {
  860. notEquivalent = append(notEquivalent, langID(lm.from))
  861. }
  862. }
  863. // Maximize undefined regions of paradigm locales.
  864. for i, v := range paradigmLocales {
  865. max, _ := addTags(Tag{lang: langID(v[0])})
  866. if v[1] == 0 {
  867. paradigmLocales[i][1] = uint16(max.region)
  868. }
  869. if v[2] == 0 {
  870. paradigmLocales[i][2] = uint16(max.region)
  871. }
  872. }
  873. }