Utility bot for rhythm game D4DJ. (Note that some dependencies are not public)
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
miyu-bot/miyu_bot/commands/common/fuzzy_matching.py

118 lines
4.5 KiB

import logging
import re
from dataclasses import dataclass, field
from typing import Dict, Tuple, List
import pykakasi
class FuzzyMap:
    """Dictionary-like container whose lookups tolerate inexact keys.

    Keys are normalised with ``romanize`` when stored, deleted and looked up.
    A lookup returns the value whose stored key gets the lowest score from the
    matcher, provided that score is not positive; otherwise it returns
    ``None``.

    Args:
        filter: Optional predicate applied to stored *values*; values for which
            it returns a falsy result are ignored by ``values()`` and by
            lookups. Defaults to accepting everything. (The parameter name
            shadows the builtin but is kept for caller compatibility.)
        matcher: Object exposing ``score(source, target)``. Defaults to a
            ``FuzzyMatcher`` with its default configuration.
    """

    def __init__(self, filter=None, matcher=None):
        # The previous default (`lambda: True`) accepted no arguments and
        # therefore raised TypeError as soon as it was applied to a value.
        self.filter = filter or (lambda n: True)
        self.matcher = matcher or FuzzyMatcher()
        self._values = {}
        # Length of the longest romanised key stored so far; used as a cheap
        # pre-filter for over-long queries in __getitem__.
        self.max_length = 0
        self.logger = logging.getLogger(__name__)

    def values(self):
        """Return a generator over the stored values accepted by the filter."""
        return (v for v in self._values.values() if self.filter(v))

    def __delitem__(self, key):
        """Remove the entry stored under the romanised form of ``key``.

        Raises:
            KeyError: If no entry exists for the romanised key.

        ``max_length`` is intentionally left untouched; a stale, larger value
        only makes the length pre-filter more permissive.
        """
        del self._values[romanize(key)]

    def __setitem__(self, key, value):
        """Store ``value`` under the romanised form of ``key``."""
        k = romanize(key)
        self._values[k] = value
        # Track the maximum, not merely the most recent key length, so that
        # adding a short key does not cause longer keys to be rejected.
        self.max_length = max(self.max_length, len(k))

    def __getitem__(self, key):
        """Return the value best matching ``key``, or ``None`` if none matches.

        ``None`` is returned when the raw key is more than 10% longer than the
        longest stored key, when no stored value passes the filter, or when
        the best candidate's score is positive.
        """
        if len(key) > self.max_length * 1.1:
            self.logger.debug(f'Rejected key "{key}" due to length.')
            return None
        key = romanize(key)
        # Score each eligible candidate exactly once. Comparing on the score
        # alone keeps the first-inserted candidate on ties, as before.
        best = min(
            ((self.matcher.score(key, k), k)
             for k, v in self._values.items() if self.filter(v)),
            key=lambda scored: scored[0],
            default=None,
        )
        if best is None:
            # Nothing to match against (empty map or everything filtered out).
            return None
        best_score, best_key = best
        if best_score > 0:
            return None
        return self._values[best_key]
def _default_special_substitution_weights() -> Dict[Tuple[str, str], float]:
    """Build a fresh table of (source char, target char) pairs that substitute for free."""
    return {
        ('v', 'b'): 0.0,
        ('l', 'r'): 0.0,
    }


@dataclass
class FuzzyMatchConfig:
    """Tunable weights read by ``FuzzyMatcher.score``.

    Field order is part of the positional constructor signature and must not
    be changed.
    """

    # Constant added to every final score.
    base_score: float = 0.0
    # Cost of a target character with no counterpart in the source.
    insertion_weight: float = 0.001
    # Cost of a source character with no counterpart in the target.
    deletion_weight: float = 1.0
    # Cost of replacing one character by a different one, unless the pair is
    # listed in special_substitution_weights.
    default_substitution_weight: float = 1.0
    # Added for each aligned pair of identical characters.
    match_weight: float = -0.2
    # Per-pair overrides, keyed by (source char, target char).
    special_substitution_weights: Dict[Tuple[str, str], float] = field(
        default_factory=_default_special_substitution_weights,
    )
    # Multiplied by the number of positional matches against a target word.
    word_match_weight: float = -0.2
    # Multiplied by the number of positional matches against the target's
    # word initials.
    acronym_match_weight: float = -0.3
class FuzzyMatcher:
    """Scores a query against a candidate with a weighted edit distance.

    Lower scores mean a better match. The score combines an edit-distance
    table (weights taken from the config) with a bonus for positional
    agreement between the query and the candidate's words or word initials.
    """

    def __init__(self, config: "FuzzyMatchConfig" = None):
        """Use ``config``, or a default ``FuzzyMatchConfig`` when it is falsy."""
        self.config = config or FuzzyMatchConfig()

    def score(self, source: str, target: str):
        """Return the match score of ``source`` (query) against ``target``.

        Args:
            source: The query string.
            target: The candidate string; split on whitespace into words for
                the word/acronym bonus.

        Returns:
            The edit-distance cost plus word bonus plus ``base_score``, or the
            constant ``1`` as soon as the bound computed per source row shows
            the final score would be positive anyway.
        """
        cfg = self.config
        l_src = len(source)
        l_tgt = len(target)
        a: List[List[float]] = [[0] * (l_tgt + 1) for _ in range(l_src + 1)]

        # First column: every source character so far was deleted. Scale by
        # the configured deletion weight so the border agrees with the
        # recurrence below (identical to the old `i` when the weight is 1.0).
        for i in range(l_src + 1):
            a[i][0] = i * cfg.deletion_weight
        # First row: every target character so far was inserted.
        for i in range(l_tgt + 1):
            a[0][i] = i * cfg.insertion_weight

        word_bonus = self._word_bonus(source, target)

        for i_src in range(1, l_src + 1):
            prev_row = a[i_src - 1]
            row = a[i_src]
            src_char = source[i_src - 1]
            for i_tgt in range(1, l_tgt + 1):
                row[i_tgt] = min(
                    prev_row[i_tgt - 1] + self._substitution_weight(src_char, target[i_tgt - 1]),
                    prev_row[i_tgt] + cfg.deletion_weight,
                    row[i_tgt - 1] + cfg.insertion_weight,
                )
            if l_tgt:
                # There are l_src - i_src source chars remaining. Each match
                # removes the insertion weight then adds the match weight, so
                # (l_src - i_src) * (match_weight - insertion_weight) is the
                # max difference that can make.
                max_additional_score = ((l_src - i_src) * (cfg.match_weight - cfg.insertion_weight) +
                                        word_bonus + cfg.base_score)
                # Only the last two cells of the row are inspected; if both
                # stay positive even with the bound applied, give up early.
                if (row[l_tgt] + max_additional_score > 0
                        and row[l_tgt - 1] + max_additional_score > 0):
                    return 1

        return a[l_src][l_tgt] + word_bonus + cfg.base_score

    def _substitution_weight(self, src_char: str, tgt_char: str) -> float:
        """Return the cost of aligning ``src_char`` with ``tgt_char``."""
        if src_char == tgt_char:
            return self.config.match_weight
        return self.config.special_substitution_weights.get(
            (src_char, tgt_char),
            self.config.default_substitution_weight,
        )

    def _word_bonus(self, source: str, target: str) -> float:
        """Return the smallest of the word, consonant-skeleton and acronym bonuses.

        Each bonus is a config weight times the number of positions at which
        ``source`` equals a comparison string: a whole target word, a target
        word with the vowels after its first letter removed, or the string of
        target word initials. A target without words yields a zero bonus
        instead of raising.
        """
        cfg = self.config
        words = target.split()

        def positional_matches(candidate: str) -> int:
            return sum(s_ch == c_ch for s_ch, c_ch in zip(source, candidate))

        def strip_vowels(s: str) -> str:
            return re.sub('[aeoiu]', '', s)

        # default=0 covers an empty or whitespace-only target, for which
        # max() over zero words would otherwise raise ValueError.
        best_word = max((positional_matches(w) for w in words), default=0)
        best_skeleton = max(
            (positional_matches(w[0] + strip_vowels(w[1:])) for w in words),
            default=0,
        )
        acronym = positional_matches(''.join(w[0] for w in words))

        return min(cfg.word_match_weight * best_word,
                   cfg.word_match_weight * best_skeleton,
                   cfg.acronym_match_weight * acronym)
def romanize(s: str) -> str:
    """Normalise ``s`` into a lowercase, space-separated ASCII key.

    Steps, in order: drop apostrophes; turn the middle dot into a space; pad
    runs of ASCII letters and runs of digits with spaces; pass the text
    through pykakasi and join the stripped, lowercased ``'hepburn'`` field of
    each converted item with spaces; remove everything except ASCII letters,
    digits, underscores and spaces; collapse whitespace runs to single spaces.
    """
    converter = pykakasi.kakasi()

    def pad_with_spaces(ele):
        # Surround the matched run with spaces so it is separated from
        # whatever text is adjacent to it.
        return f' {ele[0]} '

    text = re.sub('[\']', '', s)
    text = re.sub('[・]', ' ', text)
    text = re.sub('[A-Za-z]+', pad_with_spaces, text)
    text = re.sub('[0-9]+', pad_with_spaces, text)

    pieces = [item['hepburn'].strip().lower() for item in converter.convert(text)]
    text = ' '.join(pieces)

    text = re.sub(r'[^a-zA-Z0-9_ ]+', '', text)
    tokens = text.split()
    return ' '.join(tokens)