|
|
|
|
|
|
|
|
import re |
|
|
import unicodedata |
|
|
import six |
|
|
|
|
|
|
|
|
class Character(object):
    """Predicates that classify a single character.

    Every ``is_*`` method expects ``ch`` to be a string holding exactly one
    character: most of them call ``ord`` or ``unicodedata.category``, which
    raise ``TypeError`` for longer or empty strings.
    """

    # String labels, one per character class. Their values mirror the names
    # of the predicates below; how callers consume them is not visible here.
    f_unknown = 'unknown'
    f_is_alnum = 'is_alnum'
    f_is_alpha = 'is_alpha'
    f_is_num = 'is_num'
    f_is_space = 'is_space'
    f_is_hyphens = 'is_hyphens'
    f_is_punctuation = 'is_punctuation'
    f_is_cjk_character = 'is_cjk_character'
    f_is_jap_character = 'is_jap_character'
    f_is_russian_character = 'is_russian_character'

    # Inclusive code point ranges treated as CJK ideographs.
    # Reference: https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
    _CJK_RANGES = (
        (0x4E00, 0x9FFF),    # CJK Unified Ideographs
        (0x3400, 0x4DBF),    # Extension A
        (0x20000, 0x2A6DF),  # Extension B
        (0x2A700, 0x2B73F),  # Extension C
        (0x2B740, 0x2B81F),  # Extension D
        (0x2B820, 0x2CEAF),  # Extension E
        (0xF900, 0xFAFF),    # CJK Compatibility Ideographs
        (0x2F800, 0x2FA1F),  # CJK Compatibility Ideographs Supplement
    )

    @classmethod
    def is_alnum(cls, ch: str):
        """Return True for an alphanumeric character that is not CJK.

        ``str.isalnum()`` alone reports True for Chinese characters, so CJK
        ideographs are excluded explicitly.
        """
        return not cls.is_cjk_character(ch) and ch.isalnum()

    @classmethod
    def is_alpha(cls, ch: str):
        """Return True for an alphabetic character that is not CJK."""
        return not cls.is_cjk_character(ch) and ch.isalpha()

    @staticmethod
    def is_control(ch):
        """Return True for control/format characters (categories Cc and Cf).

        Tab, line feed and carriage return are deliberately not reported as
        control characters; ``is_space`` reports them as whitespace.
        """
        if ch in ('\t', '\n', '\r'):
            return False
        return unicodedata.category(ch) in ("Cc", "Cf")

    @classmethod
    def is_num(cls, ch: str):
        """Return True when ``ch.isdigit()`` holds and ``ch`` is not CJK."""
        return not cls.is_cjk_character(ch) and ch.isdigit()

    @classmethod
    def is_space(cls, ch):
        """Return True for space, tab, newline, carriage return or any
        character in Unicode category Zs (space separators)."""
        return ch in (" ", '\n', '\r', '\t') or unicodedata.category(ch) == 'Zs'

    @classmethod
    def is_hyphens(cls, ch):
        """Return True for ``+`` (code 43) or ``-`` (code 45)."""
        return ord(ch) in (43, 45)

    @classmethod
    def is_punctuation(cls, ch):
        """Return True for punctuation, half-width or full-width.

        All non-alphanumeric printable ASCII characters count as punctuation
        (including symbols such as ``$`` or ``^`` that Unicode does not put
        in a ``P*`` category), plus every character whose Unicode category
        starts with ``P``.
        """
        code = ord(ch)
        is_ascii_symbol = (
            33 <= code <= 47
            or 58 <= code <= 64
            or 91 <= code <= 96
            or 123 <= code <= 126
        )
        return is_ascii_symbol or unicodedata.category(ch).startswith("P")

    @classmethod
    def is_cjk_character(cls, ch):
        """Return True when ``ch`` is a CJK ideograph (Chinese included)."""
        code = ord(ch)
        return any(low <= code <= high for low, high in cls._CJK_RANGES)

    @classmethod
    def is_jap_character(cls, ch):
        """Return True for Japanese kana.

        Covers Hiragana (U+3040-U+309F), Katakana (U+30A0-U+30FF) and
        Katakana Phonetic Extensions (U+31F0-U+31FF).
        """
        code = ord(ch)
        return 0x3040 <= code <= 0x30FF or 0x31F0 <= code <= 0x31FF

    @classmethod
    def is_russian_character(cls, ch):
        """Return True for a Russian (Cyrillic) letter.

        U+0410-U+044F is the contiguous block from the first capital letter
        to the last small letter. The letters U+0401 (capital IO) and U+0451
        (small IO) live outside that block and used to be rejected; they are
        accepted now. U+0450 was accepted by the previous implementation and
        is kept so existing callers see no regression.
        """
        code = ord(ch)
        return 0x0410 <= code <= 0x0451 or code == 0x0401

    @staticmethod
    def convert_to_unicode(text):
        """Converts `text` to Unicode (if it's not already), assuming utf-8 input.

        :param text: a text string or a byte string.
        :return: the text unchanged, or the bytes decoded as UTF-8 with
            undecodable bytes dropped.
        :raises ValueError: when ``text`` is neither text nor bytes.
        """
        # six.text_type / six.binary_type are str / bytes on Python 3 and
        # unicode / str on Python 2, so one code path serves both
        # interpreters without naming the Python-2-only ``unicode`` builtin.
        if isinstance(text, six.text_type):
            return text
        if isinstance(text, six.binary_type):
            return text.decode("utf-8", "ignore")
        raise ValueError("Unsupported string type: %s" % (type(text)))
|
|
|
|
|
|
|
|
class LowerCase(object):
    """Length-preserving lowercasing with folding of look-alike letters."""

    # Maps a letter carrying a diacritic to its plain counterpart. Lookups
    # happen after lowercasing, so the upper-case key can never match; it is
    # kept so the mapping stays unchanged for any other reader.
    confuse_map = {
        'Й': 'И',
        'й': 'и',
        'ѐ': 'е',
        'ё': 'е',
        'ѓ': 'г',
        'ї': 'і',

        'á': 'a',
        'é': 'e',
        'í': 'i',
        'ó': 'o',
        'ú': 'u',
        'ü': 'u',
        'ñ': 'n',
    }

    @classmethod
    def lowercase(cls, string):
        """Lowercase ``string`` without changing its number of characters.

        :param string: any object; it is converted with ``str()`` first.
        :return: the lowercased text, with every character found in
            ``confuse_map`` replaced by its mapped value.
        :raises AssertionError: if the result length differs from the input
            length (an invariant check; not expected to trigger).
        """
        text = str(string)
        lowered = text.lower()
        if len(lowered) != len(text):
            # str.lower() may expand one character into several (U+0130
            # becomes 'i' + U+0307). Lower each character on its own and
            # keep only the base code point so positions stay aligned.
            lowered = ''.join(c.lower()[0] for c in text)

        # No manual "+32" shift for Cyrillic capitals is needed: str.lower()
        # already maps U+0410-U+042F to U+0430-U+044F. The old shift used an
        # inclusive upper bound of 1072, which moved the small letter U+0430
        # to U+0450 and then, through confuse_map, to U+0435.
        result = ''.join(cls.confuse_map.get(c, c) for c in lowered)

        if len(text) != len(result):
            raise AssertionError('this method should not change the char num. '
                                 'string: {}, result: {}'.format(text, result))
        return result
|
|
|
|
|
|
|
|
class Pattern(object):
    r"""
    Reusable regular-expression source strings.

    Quick syntax reminder:
    \d matches any decimal digit, e.g. [0-9].
    \s matches any whitespace character (space, \t, \n, \r, \f, ...).
    re{n,m} matches the preceding expression n to m times, greedily.
    """

    # --- character runs -------------------------------------------------
    # One or more ASCII letters, digits, or characters in U+4E00-U+9FA5.
    alp_num_ch = r'[A-Z0-9a-z\u4e00-\u9fa5]+'
    # Either a single non-alphanumeric character or a run of ASCII
    # letters/digits.
    alp_num_or_others = r'[^A-Z0-9a-z]|[A-Z0-9a-z]+'
    # One or more whitespace characters.
    any_blanks = r'\s+'

    # --- bracketed spans (non-greedy, so each pair matches separately) ---
    brackets = r'\(.*?\)'
    square_brackets = r'\[.*?\]'

    # --- literal keywords -------------------------------------------------
    # Any one of the three literal words listed.
    hw_ry_xy = r'华为|荣耀|小艺'

    # --- "<letter(s)><digits> p<digits>" shapes ---------------------------
    # A lowercase letter plus 1-2 digits, whitespace, then 'p' plus 1-2
    # digits.
    p_pattern = r'[a-z]\d{1,2}\s+p\d{1,2}'
    # Same shape with two capture groups: (letters, optional whitespace,
    # 1-2 digits) and ('p' plus 1-2 digits).
    pro_pattern = r'([a-z]+\s*\d{1,2})\s+(p\d{1,2})'

    # --- escaped regex shorthands -----------------------------------------
    # Two literal backslashes followed by one of d/D/s/S/w/W and an optional
    # '+' or '*'. NOTE(review): presumably used to locate escaped shorthand
    # classes inside pattern text - confirm against callers.
    regex_dsw_find = r'\\\\[dDsSwW][\+\*]?'
|
|
|
|
|
|
|
|
class ValidPeriod(object):
    """Extract a validity period, expressed in days, from Chinese text.

    The regex fragments below each start with a ``?<label>`` marker. The
    marker is not regex syntax: ``get_pattern_label`` reads the labels in
    order of appearance and ``clean_pattern_label`` strips them before the
    pattern is handed to ``re``. Each fragment is wrapped in exactly one
    capture group inside ``pattern1``, so the n-th label names the n-th
    group.
    """

    # Label names.
    l_compare_lt = 'l_compare_lt'
    l_compare_gt = 'l_compare_gt'
    l_time = 'l_time'
    l_time_unit = 'l_time_unit'

    # Comparison words that may precede the amount.
    l_compare_lt_prefix_regex = rf'?<{l_compare_lt}>不超过|没到|不到|少于'
    l_compare_gt_prefix_regex = rf'?<{l_compare_gt}>超|超过|超过了|大于|不止'

    # Comparison words that may follow the unit.
    l_compare_lt_suffix_regex = rf'?<{l_compare_lt}>没到|不到|内|以内|之内'
    l_compare_gt_suffix_regex = rf'?<{l_compare_gt}>以上|不止'

    # The amount: Chinese numerals and/or decimal digits. The alternatives
    # are listed without '|' because inside a character class '|' is a
    # literal pipe, which made strings such as "|天" match as a time.
    l_time_regex = rf'?<{l_time}>[两壹零一二三四五六七八九十百千\d]+'

    # The time unit.
    l_time_unit_regex = rf'?<{l_time_unit}>(?:个)?年|个月|周|天|日|星期|个星期'

    # [comparison prefix] amount unit [comparison suffix]
    pattern1 = rf'(?:({l_compare_lt_prefix_regex})|({l_compare_gt_prefix_regex}))?\s*({l_time_regex})\s*({l_time_unit_regex})\s*(?:({l_compare_lt_suffix_regex})|({l_compare_gt_suffix_regex}))?'

    @staticmethod
    def demo1():
        """
        Parse one sample sentence and print the result.

        Example phrases:
        一个星期, 超七天后, 七日内, 第七天, 超过了七天, 不止5天

        The phrases below were found as validity periods in annotated data
        and may need to be handled in the future:
        刚买2天, 昨天取的, 昨天到货, 签收后的第二天, 签收后七天内, 前两天, 货还没发, 用了几天
        :return: None
        """
        string = "5天不止"

        ret = ValidPeriod.valid_period_parse(string)
        print(ret)
        return

    @staticmethod
    def time_convert(time_string: str):
        """Convert a numeral made of Chinese digits and/or decimal digits.

        Digits accumulate as pending text; a multiplier (十/百/千) scales the
        pending digits, or counts as one unit when nothing is pending, and
        adds the product to the total. Characters that are neither digits
        nor multipliers are ignored.

        :param time_string: e.g. "5", "十", "二十一", "一百零五".
        :return: the integer value; 0 when no digit or multiplier is present.
        """
        multipliers = {
            '十': 10,
            '百': 100,
            '千': 1000,
        }
        digits = {
            '壹': 1,
            '两': 2,
            '零': 0,
            '一': 1,
            '二': 2,
            '三': 3,
            '四': 4,
            '五': 5,
            '六': 6,
            '七': 7,
            '八': 8,
            '九': 9,
        }
        total = 0
        pending = ''
        for ch in time_string:
            if ch.isdecimal():
                pending += ch
                continue

            unit = multipliers.get(ch)
            if unit is None:
                pending += str(digits.get(ch, ''))
                continue

            # A bare multiplier ("十") stands for one unit; otherwise scale
            # whatever digits are pending, however many there are, so that
            # "零五十" contributes 50 rather than a stray 5.
            total += unit * int(pending) if pending else unit
            pending = ''

        # Trailing digits are the ones/remainder. Guard against the empty
        # case: int('') raises ValueError, which used to make every numeral
        # ending in a multiplier ("十", "二十", "一百") crash.
        if pending:
            total += int(pending)
        return total

    @staticmethod
    def time_unit_convert(time_unit_string: str):
        """Return the number of days represented by one time unit.

        :param time_unit_string: a unit captured by ``l_time_unit_regex``.
        :return: days per unit; 1 for an unrecognised unit.
        """
        days_per_unit = {
            '天': 1,
            '日': 1,
            '周': 7,
            '星期': 7,
            '个星期': 7,
            '个月': 30,
            '年': 365,
            # The unit regex also accepts the optional-measure-word form.
            '个年': 365,
        }
        return days_per_unit.get(time_unit_string, 1)

    @staticmethod
    def get_pattern_label(pattern: str):
        """
        Collect the label carried by every sub-expression of ``pattern``.

        :param pattern: a pattern containing ``?<label>`` markers.
        :return: list of label names, in order of appearance.
        """
        return re.findall(r'\?<(.*?)>', pattern)

    @staticmethod
    def clean_pattern_label(pattern: str):
        """Strip every ``?<label>`` marker so ``re`` can compile the pattern.

        :param pattern: a pattern containing ``?<label>`` markers.
        :return: the pattern text without the markers.
        """
        return re.sub(r'\?<.*?>', '', pattern)

    @classmethod
    def valid_period_parse(cls, string: str) -> tuple:
        """Search ``string`` with ``cls.pattern1`` and estimate the days.

        :param string: text to scan; only the first match is used.
        :return: ``(days, detail)``; ``days`` is ``None`` when nothing
            matched.
        """
        label_name_list, label_string_list = cls._search_label_list(string, cls.pattern1)
        return cls._estimate_days(label_name_list, label_string_list)

    @classmethod
    def _estimate_days(cls, label_name_list, label_string_list) -> tuple:
        """Combine the labelled captures into a day count.

        ``days = main_time * scale + bias``. A "greater than" marker sets
        ``bias`` to 1; a "less than" marker leaves the estimate unchanged.

        :param label_name_list: labels of the groups that matched.
        :param label_string_list: the matched text, aligned with the labels.
        :return: ``(days, detail)`` where ``detail`` holds ``main_time``,
            ``scale`` and ``bias``; ``days`` is ``None`` when either list
            is empty.
        """
        bias = 0
        scale = 1
        main_time = 0

        for label_name, label_string in zip(label_name_list, label_string_list):
            if label_name == cls.l_compare_gt:
                bias = 1
            elif label_name == cls.l_time:
                main_time += cls.time_convert(label_string)
            elif label_name == cls.l_time_unit:
                scale = cls.time_unit_convert(label_string)
            # cls.l_compare_lt and unknown labels do not alter the estimate.

        detail = {
            'main_time': main_time,
            'scale': scale,
            'bias': bias
        }

        if not label_name_list or not label_string_list:
            return None, detail
        return main_time * scale + bias, detail

    @classmethod
    def _search_label_list(cls, string: str, pattern: str) -> tuple:
        """Run a labelled pattern and pair labels with the matched text.

        :param string: text to search.
        :param pattern: a pattern containing ``?<label>`` markers, one per
            capture group.
        :return: ``(label_names, label_strings)`` restricted to the groups
            that took part in the match; two empty lists when there is no
            match.
        """
        label_name_list = cls.get_pattern_label(pattern)
        match = re.search(pattern=cls.clean_pattern_label(pattern), string=string)
        if match is None:
            return list(), list()

        new_label_name_list, new_label_string_list = list(), list()
        for label_name, label_string in zip(label_name_list, match.groups()):
            # Groups on the untaken side of an alternation are None.
            if label_string is None:
                continue
            new_label_name_list.append(label_name)
            new_label_string_list.append(label_string)

        return new_label_name_list, new_label_string_list

    def __init__(self):
        pass
|
|
|