parent 3cb092abb9
commit 31f64a2799
@@ -0,0 +1,23 @@
+import logging
+
+from discord.ext import commands
+
+from miyu_bot.commands.common.fuzzy_matching import romanize, FuzzyMatcher
+
+
+class Utility(commands.Cog):
+    def __init__(self, bot: commands.Bot):
+        self.bot = bot
+        self.logger = logging.getLogger(__name__)
+
+    @commands.command(hidden=True)
+    async def romanize(self, ctx: commands.Context, *, arg: str):
+        await ctx.send(romanize(arg))
+
+    @commands.command(hidden=True, ignore_extra=False)
+    async def similarity_score(self, ctx: commands.Context, source: str, target: str):
+        await ctx.send(str(FuzzyMatcher().score(romanize(source), romanize(target))))
+
+
+def setup(bot):
+    bot.add_cog(Utility(bot))
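
For reference, a minimal sketch of how this new cog might be loaded and its hidden commands exercised, assuming a discord.py 1.x bot; the extension path miyu_bot.commands.utility and the token are placeholders, not part of this commit. The second hunk below updates miyu_bot/commands/common/fuzzy_matching.py, the module the cog imports from.

import discord
from discord.ext import commands

bot = commands.Bot(command_prefix='!', intents=discord.Intents.default())
bot.load_extension('miyu_bot.commands.utility')  # assumed module path

# In a channel the bot can read:
#   !romanize 日本語のテキスト        -> replies with the romanized string
#   !similarity_score hello hallo   -> replies with the fuzzy match score
bot.run('YOUR_BOT_TOKEN')  # placeholder token
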
@@ -1,76 +1,118 @@
 import logging
 import re
-from typing import Tuple
+from dataclasses import dataclass, field
+from typing import Dict, Tuple, List
 
 import pykakasi
 
 
-class FuzzyMatcher:
-    def __init__(self, filter, threshold: float = 1):
+class FuzzyMap:
+    def __init__(self, filter=lambda n: True, matcher=None):
         self.filter = filter or (lambda n: True)
-        self.threshold = threshold
-        self.values = {}
+        self.matcher = matcher or FuzzyMatcher()
+        self._values = {}
         self.max_length = 0
         self.logger = logging.getLogger(__name__)
 
+    def values(self):
+        return (v for v in self._values.values() if self.filter(v))
+
+    def __delitem__(self, key):
+        k = romanize(key)
+        self._values.__delitem__(k)
+
     def __setitem__(self, key, value):
         k = romanize(key)
-        self.values[k] = value
-        self.max_length = len(k[0])
+        self._values[k] = value
+        self.max_length = len(k)
 
     def __getitem__(self, key):
         if len(key) > self.max_length * 1.1:
             self.logger.debug(f'Rejected key "{key}" due to length.')
             return None
-        key, _ = romanize(key)
-        result = min((k for k, v in self.values.items() if self.filter(v)),
-                     key=lambda v: fuzzy_match_score(key, *v, threshold=self.threshold))
-        if fuzzy_match_score(key, *result, threshold=self.threshold) > self.threshold:
+        key = romanize(key)
+        result = min((k for k, v in self._values.items() if self.filter(v)), key=lambda k: self.matcher.score(key, k))
+        if self.matcher.score(key, result) > 0:
             return None
-        return self.values[result]
+        return self._values[result]
 
 
-_insertion_weight = 0.001
-_deletion_weight = 1
-_substitution_weight = 1
+@dataclass
+class FuzzyMatchConfig:
+    base_score: float = 0.0
+    insertion_weight: float = 0.001
+    deletion_weight: float = 1.0
+    default_substitution_weight: float = 1.0
+    match_weight: float = -0.2
+    special_substitution_weights: Dict[Tuple[str, str], float] = field(default_factory=lambda: {
+        ('v', 'b'): 0.0,
+        ('l', 'r'): 0.0,
+    })
+    word_match_weight: float = -0.2
+    acronym_match_weight: float = -0.3
 
 
-def fuzzy_match_score(source: str, target: str, words, threshold: float) -> float:
-    m = len(source)
-    n = len(target)
-    a = [[0] * (n + 1) for _ in range(m + 1)]
-
-    for i in range(m + 1):
-        a[i][0] = i
-
-    for i in range(n + 1):
-        a[0][i] = i * _insertion_weight
-
-    def strip_vowels(s):
-        return re.sub('[aeoiu]', '', s)
-
-    word_match_bonus = 0.1 * max(max(sum(a == b for a, b in zip(source, w)) for w in words),
-                                 max(sum(a == b for a, b in
-                                         zip(source[0] + strip_vowels(source[1:]), w[0] + strip_vowels(w[1:]))) for w in
-                                     words),
-                                 sum(a == b for a, b in zip(source, ''.join(w[0] for w in words))))
-
-    for i in range(1, m + 1):
-        for j in range(1, n + 1):
-            a[i][j] = min(a[i - 1][j - 1] + _substitution_weight if source[i - 1] != target[j - 1] else a[i - 1][j - 1],
-                          a[i - 1][j] + _deletion_weight,
-                          a[i][j - 1] + _insertion_weight)
-            if j == n and (a[i][j] - (m - i) * _insertion_weight - word_match_bonus) > threshold:
-                return 9999
-
-    return a[m][n] - word_match_bonus
+class FuzzyMatcher:
+    def __init__(self, config: FuzzyMatchConfig = None):
+        self.config = config or FuzzyMatchConfig()
+
+    def score(self, source: str, target: str):
+        l_src = len(source)
+        l_tgt = len(target)
+        a: List[List[float]] = [[0] * (l_tgt + 1) for _ in range(l_src + 1)]
+
+        for i in range(l_src + 1):
+            a[i][0] = i
+
+        for i in range(l_tgt + 1):
+            a[0][i] = i * self.config.insertion_weight
+
+        def strip_vowels(s):
+            return re.sub('[aeoiu]', '', s)
+
+        words = target.split()
+        word_bonus = min(self.config.word_match_weight * max(sum(a == b for a, b in zip(source, w)) for w in words),
+                         self.config.word_match_weight * max(sum(a == b for a, b in
+                                                                 zip(source, w[0] + strip_vowels(w[1:]))) for w in
+                                                             words),
+                         self.config.acronym_match_weight * sum(
+                             a == b for a, b in zip(source, ''.join(w[0] for w in words))))
+
+        def sub_weight_at(n, m):
+            if source[n - 1] != target[m - 1]:
+                return self.config.special_substitution_weights.get(
+                    (source[n - 1], target[m - 1]),
+                    self.config.default_substitution_weight
+                )
+            else:
+                return self.config.match_weight
+
+        for i_src in range(1, l_src + 1):
+            for i_tgt in range(1, l_tgt + 1):
+                a[i_src][i_tgt] = min(a[i_src - 1][i_tgt - 1] + sub_weight_at(i_src, i_tgt),
+                                      a[i_src - 1][i_tgt] + self.config.deletion_weight,
+                                      a[i_src][i_tgt - 1] + self.config.insertion_weight)
+
+                # There are l_src - i_src source chars remaining.
+                # Each match removes the insertion weight then adds the match weight, so
+                # (l_src - i_src) * (self.config.match_weight - self.config.insertion_weight)
+                # is the largest difference those remaining matches can make.
+                max_additional_score = ((l_src - i_src) * (self.config.match_weight - self.config.insertion_weight) +
+                                        word_bonus + self.config.base_score)
+                if i_tgt == l_tgt and (
+                        a[i_src][i_tgt] + max_additional_score) > 0 and \
+                        (a[i_src][i_tgt - 1] + max_additional_score) > 0:
+                    return 1
+
+        return a[l_src][l_tgt] + word_bonus + self.config.base_score
 
 
-def romanize(s: str) -> Tuple[str, Tuple[str]]:
+def romanize(s: str) -> str:
     kks = pykakasi.kakasi()
     s = re.sub('[\']', '', s)
+    s = re.sub('[・]', ' ', s)
     s = re.sub('[A-Za-z]+', lambda ele: f' {ele[0]} ', s)
+    s = re.sub('[0-9]+', lambda ele: f' {ele[0]} ', s)
     s = ' '.join(c['hepburn'].strip().lower() for c in kks.convert(s))
     s = re.sub(r'[^a-zA-Z0-9_ ]+', '', s)
-    words = tuple(s.split())
-    return ''.join(words), words
+    return ' '.join(s.split())
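
As a quick sanity check on the new API, a hedged usage sketch (the values are illustrative; the module path comes from the cog's import above). Note the sign convention: scores are costs, each matched character contributes the negative match_weight, and FuzzyMap treats any score above 0 as a miss.

from miyu_bot.commands.common.fuzzy_matching import FuzzyMap, FuzzyMatcher, romanize

matcher = FuzzyMatcher()
# Identical strings score negative: every matched character adds match_weight (-0.2).
print(matcher.score(romanize('hello'), romanize('hello')))
# A hopeless candidate trips the early cutoff and comes back as 1.
print(matcher.score(romanize('hello'), romanize('zzzzz')))

fmap = FuzzyMap(filter=lambda v: True)
fmap['Hello World'] = 42    # keys are romanized before storage
print(fmap['hello world'])  # 42: effectively an exact match
print(fmap['zzz'])          # None: best score stays above 0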