Fix the tag span generation for tags with nonascii characters (#2700)
* Update mention and tag regexes from mastodon * Normalize nonascii tag names the same way that mastodon does
This commit is contained in:
parent
687cffd540
commit
5d09a67b52
5 changed files with 44 additions and 3 deletions
|
@ -0,0 +1,26 @@
|
||||||
|
/* Copyright 2022 Tusky contributors
|
||||||
|
*
|
||||||
|
* This file is a part of Tusky.
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify it under the terms of the
|
||||||
|
* GNU General Public License as published by the Free Software Foundation; either version 3 of the
|
||||||
|
* License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* Tusky is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
|
||||||
|
* the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
|
||||||
|
* Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License along with Tusky; if not,
|
||||||
|
* see <http://www.gnu.org/licenses>. */
|
||||||
|
|
||||||
|
package com.keylesspalace.tusky.util
|
||||||
|
|
||||||
|
// Inspired by https://github.com/mastodon/mastodon/blob/main/app/lib/ascii_folding.rb
|
||||||
|
|
||||||
|
/**
 * Accented / non-ASCII Latin characters that are folded to plain ASCII.
 *
 * The character at index `i` here is replaced by the character at index `i` of
 * [EQUIVALENT_ASCII_CHARS], so both strings must stay exactly the same length.
 *
 * U+0103..U+0105 (a-breve, A-ogonek, a-ogonek) and U+0149 (n preceded by apostrophe) are spelled
 * as escapes on purpose: they are easily damaged by encoding round-trips (U+0149 in particular
 * gets decomposed into two characters), which silently shifts every later pair by one.
 */
private const val NON_ASCII_CHARS =
    "ÀÁÂÃÄÅàáâãäåĀāĂ\u0103\u0104\u0105ÇçĆćĈĉĊċČčÐðĎďĐđÈÉÊËèéêëĒēĔĕĖėĘęĚěĜĝĞğĠġĢģĤĥĦħÌÍÎÏìíîïĨĩĪīĬĭĮįİıĴĵĶķĸĹĺĻļĽľĿŀŁłÑñŃńŅņŇň\u0149ŊŋÒÓÔÕÖØòóôõöøŌōŎŏŐőŔŕŖŗŘřŚśŜŝŞşŠšſŢţŤťŦŧÙÚÛÜùúûüŨũŪūŬŭŮůŰűŲųŴŵÝýÿŶŷŸŹźŻżŽž"

/** ASCII replacement for the character at the same index in [NON_ASCII_CHARS]. */
private const val EQUIVALENT_ASCII_CHARS =
    "AAAAAAaaaaaaAaAaAaCcCcCcCcCcDdDdDdEEEEeeeeEeEeEeEeEeGgGgGgGgHhHhIIIIiiiiIiIiIiIiIiJjKkkLlLlLlLlLlNnNnNnNnnNnOOOOOOooooooOoOoOoRrRrRrSsSsSsSssTtTtTtUUUUuuuuUuUuUuUuUuUuWwYyyYyYZzZzZz"

/**
 * Lookup table from a non-ASCII Latin character to its closest ASCII equivalent.
 *
 * Characters that are not keys of this map have no folding and should be left untouched by
 * callers. The table is built once, when this file's top-level properties are initialised.
 */
val unicodeToASCIIMap: Map<Char, Char> = run {
    // `zip` silently truncates to the shorter input, so a one-character slip in either table
    // would misalign every following pair without any visible error. Fail loudly instead.
    check(NON_ASCII_CHARS.length == EQUIVALENT_ASCII_CHARS.length) {
        "ASCII folding tables differ in length: " +
            "${NON_ASCII_CHARS.length} vs ${EQUIVALENT_ASCII_CHARS.length}"
    }
    NON_ASCII_CHARS.zip(EQUIVALENT_ASCII_CHARS).toMap()
}
|
||||||
|
|
||||||
|
/**
 * Folds the non-ASCII characters of [text] to their ASCII equivalents.
 *
 * Works one [Char] at a time: a character that is a key of [unicodeToASCIIMap] is replaced by
 * the mapped value, every other character is copied through unchanged. The result therefore
 * always has the same length as [text].
 *
 * @param text the text to normalise; it is not modified.
 * @return a new string holding the folded text.
 */
fun normalizeToASCII(text: CharSequence): CharSequence =
    buildString(text.length) {
        for (original in text) {
            append(unicodeToASCIIMap[original] ?: original)
        }
    }
|
|
@ -124,7 +124,7 @@ fun setClickableText(
|
||||||
|
|
||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
fun getTagName(text: CharSequence, tags: List<HashTag>?): String? {
|
fun getTagName(text: CharSequence, tags: List<HashTag>?): String? {
|
||||||
val scrapedName = text.subSequence(1, text.length).toString()
|
val scrapedName = normalizeToASCII(text.subSequence(1, text.length)).toString()
|
||||||
return when (tags) {
|
return when (tags) {
|
||||||
null -> scrapedName
|
null -> scrapedName
|
||||||
else -> tags.firstOrNull { it.name.equals(scrapedName, true) }?.name
|
else -> tags.firstOrNull { it.name.equals(scrapedName, true) }?.name
|
||||||
|
|
|
@ -12,13 +12,16 @@ import kotlin.math.max
|
||||||
* @see <a href="https://github.com/tootsuite/mastodon/blob/master/app/models/tag.rb">
|
* @see <a href="https://github.com/tootsuite/mastodon/blob/master/app/models/tag.rb">
|
||||||
* Tag#HASHTAG_RE</a>.
|
* Tag#HASHTAG_RE</a>.
|
||||||
*/
|
*/
|
||||||
private const val TAG_REGEX = "(?:^|[^/)A-Za-z0-9_])#([\\w_]*[\\p{Alpha}_][\\w_]*)"
|
// Separator characters permitted inside a hashtag: underscore, U+00B7 and U+200C.
// The two code points are stored as regex escapes (a literal backslash followed by uXXXX), not as
// the characters themselves, because this string is spliced into character classes of TAG_REGEX.
private const val HASHTAG_SEPARATORS = "_\\u00B7\\u200c"
|
||||||
|
// Body of a regex character class (no surrounding brackets) listing the Unicode general
// categories L (letters), Mn (non-spacing marks), Nd (decimal digits), Nl (letter numbers) and
// Pc (connector punctuation). It is interpolated inside [...] wherever a Unicode-aware
// replacement for \w is wanted; the linked answer explains why \w itself is not used.
private const val UNICODE_WORD = "\\p{L}\\p{Mn}\\p{Nd}\\p{Nl}\\p{Pc}" // Ugh, java ( https://stackoverflow.com/questions/4304928/unicode-equivalents-for-w-and-b-in-java-regular-expressions )
|
||||||
|
// Hashtag pattern.
//  - Prefix: the '#' must be at the start of the input or follow a character that is not '/',
//    ')' or a \w character. The prefix is a non-capturing group, not a lookbehind, so that
//    preceding character is part of the overall match.
//  - Group 1: the tag name without the leading '#'. It is one of two alternatives:
//      group 2: a name that starts and ends with a UNICODE_WORD character or '_' and has, in
//               between, at least one \p{Alpha} character or HASHTAG_SEPARATORS character;
//      group 3: a run of UNICODE_WORD characters / '_' containing at least one \p{Alpha}.
// NOTE(review): the prefix uses \w while MENTION_REGEX uses UNICODE_WORD for the same purpose;
// whether \w and \p{Alpha} are Unicode-aware depends on the regex engine and the flags the
// pattern is compiled with, which are not visible here - confirm.
private const val TAG_REGEX = "(?:^|[^/)\\w])#(([${UNICODE_WORD}_][$UNICODE_WORD$HASHTAG_SEPARATORS]*[\\p{Alpha}$HASHTAG_SEPARATORS][$UNICODE_WORD$HASHTAG_SEPARATORS]*[${UNICODE_WORD}_])|([${UNICODE_WORD}_]*[\\p{Alpha}][${UNICODE_WORD}_]*))"
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @see <a href="https://github.com/tootsuite/mastodon/blob/master/app/models/account.rb">
|
* @see <a href="https://github.com/tootsuite/mastodon/blob/master/app/models/account.rb">
|
||||||
* Account#MENTION_RE</a>
|
* Account#MENTION_RE</a>
|
||||||
*/
|
*/
|
||||||
private const val MENTION_REGEX = "(?:^|[^/[:word:]])@([a-z0-9_-]+(?:@[a-z0-9\\.\\-]+[a-z0-9]+)?)"
|
// Local part of a mention: one or more \w characters, optionally followed by a run of \w, '.'
// or '-' that itself ends in a \w character (so a name cannot end in '.' or '-').
// The optional tail is a capturing group, so embedding this fragment adds one group to the
// enclosing pattern's numbering.
private const val USERNAME_REGEX = "[\\w]+([\\w\\.-]+[\\w]+)?"
|
||||||
|
// Mention pattern.
//  - Prefix: a lookbehind requiring the '@' to be at the start of the input or to follow a
//    character that is neither '/' nor a UNICODE_WORD character. Being a lookbehind, that
//    character is not part of the match.
//  - Group 1: the mention without its leading '@' (username plus optional "@domain").
//  - Group 2: the username, as matched by USERNAME_REGEX.
//  - Group 3: the optional tail group that comes from inside USERNAME_REGEX.
//  - The "@domain" suffix is non-capturing: '@', then UNICODE_WORD characters, '.' or '-',
//    ending in at least one UNICODE_WORD character.
private const val MENTION_REGEX = "(?<=^|[^\\/$UNICODE_WORD])@(($USERNAME_REGEX)(?:@[$UNICODE_WORD\\.\\-]+[$UNICODE_WORD]+)?)"
|
||||||
|
|
||||||
private const val HTTP_URL_REGEX = "(?:(^|\\b)http://[^\\s]+)"
|
private const val HTTP_URL_REGEX = "(?:(^|\\b)http://[^\\s]+)"
|
||||||
private const val HTTPS_URL_REGEX = "(?:(^|\\b)https://[^\\s]+)"
|
private const val HTTPS_URL_REGEX = "(?:(^|\\b)https://[^\\s]+)"
|
||||||
|
|
|
@ -38,6 +38,7 @@ class SpanUtilsTest {
|
||||||
return listOf(
|
return listOf(
|
||||||
"@mention",
|
"@mention",
|
||||||
"#tag",
|
"#tag",
|
||||||
|
"#tåg",
|
||||||
"https://thr.ee/meh?foo=bar&wat=@at#hmm",
|
"https://thr.ee/meh?foo=bar&wat=@at#hmm",
|
||||||
"http://thr.ee/meh?foo=bar&wat=@at#hmm"
|
"http://thr.ee/meh?foo=bar&wat=@at#hmm"
|
||||||
)
|
)
|
||||||
|
|
|
@ -86,6 +86,17 @@ class LinkHelperTest {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Every known tag must still be resolved by [getTagName] after its ASCII vowels have been
 * swapped for accented variants, and the name returned must belong to one of the known tags.
 */
@Test
fun whenCheckingTags_tagNameIsNormalized() {
    // Pairs each plain vowel with an accented stand-in (some in a different case).
    val accentedVowels = "aeiou".toList().zip("åÉîøÜ".toList()).toMap()

    tags.forEach { tag ->
        val accentedChars = tag.name
            .map { letter -> accentedVowels[letter] ?: letter }
            .toCharArray()
        val accentedName = String(accentedChars)

        val resolvedName = getTagName("#$accentedName", tags)

        Assert.assertNotNull(resolvedName)
        Assert.assertNotNull(tags.firstOrNull { candidate -> candidate.name == resolvedName })
    }
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
fun hashedUrlSpans_withNoMatchingTag_areNotModified() {
|
fun hashedUrlSpans_withNoMatchingTag_areNotModified() {
|
||||||
for (tag in tags) {
|
for (tag in tags) {
|
||||||
|
|
Loading…
Reference in a new issue