Spaces:

intelli-zen
/

document_loaders

Paused

App Files Files Community

document_loaders / toolbox /string /character.py

HoneyTian

first commit

e94100d about 1 year ago

raw

history blame contribute delete

11.8 kB

	#!/usr/bin/python3
	# -*- coding: utf-8 -*-
	import re
	import unicodedata
	import six


	class Character(object):
	    """Classification helpers that operate on a single character.

	    Every ``is_*`` predicate expects a one-character string: most of them call
	    ``ord()`` or ``unicodedata.category()``, which raise ``TypeError`` for an
	    empty or multi-character string.
	    """

	    # Label names, one per predicate below.
	    f_unknown = 'unknown'
	    f_is_alnum = 'is_alnum'
	    f_is_alpha = 'is_alpha'
	    f_is_num = 'is_num'
	    f_is_space = 'is_space'
	    f_is_hyphens = 'is_hyphens'
	    f_is_punctuation = 'is_punctuation'
	    f_is_cjk_character = 'is_cjk_character'
	    f_is_jap_character = 'is_jap_character'
	    f_is_russian_character = 'is_russian_character'

	    @classmethod
	    def is_alnum(cls, ch: str):
	        """Return True if `ch` is alphanumeric and not a CJK ideograph.

	        ``str.isalnum()`` is True for Chinese characters, so CJK ideographs
	        are rejected first.
	        """
	        if cls.is_cjk_character(ch):
	            return False
	        return ch.isalnum()

	    @classmethod
	    def is_alpha(cls, ch: str):
	        """Return True if `ch` is alphabetic and not a CJK ideograph."""
	        if cls.is_cjk_character(ch):
	            return False
	        return ch.isalpha()

	    @staticmethod
	    def is_control(ch):
	        """Return True if `ch` is a control/format character (category Cc or Cf).

	        Tab, line feed and carriage return are deliberately not counted as
	        control characters.
	        """
	        if ch in ('\t', '\n', '\r'):
	            return False
	        return unicodedata.category(ch) in ("Cc", "Cf")

	    @classmethod
	    def is_num(cls, ch: str):
	        """Return True if `ch` is a digit (``str.isdigit``) and not a CJK ideograph."""
	        if cls.is_cjk_character(ch):
	            return False
	        return ch.isdigit()

	    @classmethod
	    def is_space(cls, ch):
	        """Return True for space, tab, CR, LF or any Unicode ``Zs`` separator."""
	        if ch in (" ", '\n', '\r', '\t'):
	            return True
	        return unicodedata.category(ch) == 'Zs'

	    @classmethod
	    def is_hyphens(cls, ch):
	        """Return True if `ch` is ``+`` (code 43) or ``-`` (code 45)."""
	        return ord(ch) in (43, 45)

	    @classmethod
	    def is_punctuation(cls, ch):
	        """Return True if `ch` is punctuation (half-width or full-width).

	        All non-alphanumeric printable ASCII characters (codes 33-47, 58-64,
	        91-96, 123-126) count as punctuation, as does any character whose
	        Unicode category starts with ``P``.
	        """
	        code = ord(ch)
	        in_ascii_symbol_range = (
	            33 <= code <= 47
	            or 58 <= code <= 64
	            or 91 <= code <= 96
	            or 123 <= code <= 126
	        )
	        return in_ascii_symbol_range or unicodedata.category(ch).startswith("P")

	    @classmethod
	    def is_cjk_character(cls, ch):
	        """Return True if `ch` lies in one of the CJK ideograph blocks listed below.

	        Reference: https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
	        """
	        code = ord(ch)
	        cjk_ranges = (
	            (0x4E00, 0x9FFF),
	            (0x3400, 0x4DBF),
	            (0x20000, 0x2A6DF),
	            (0x2A700, 0x2B73F),
	            (0x2B740, 0x2B81F),
	            (0x2B820, 0x2CEAF),
	            (0xF900, 0xFAFF),
	            (0x2F800, 0x2FA1F),
	        )
	        return any(low <= code <= high for low, high in cjk_ranges)

	    @classmethod
	    def is_jap_character(cls, ch):
	        """Return True if `ch` is in 0x3040-0x309F, 0x30A0-0x30FF or 0x31F0-0x31FF."""
	        code = ord(ch)
	        kana_ranges = (
	            (0x3040, 0x309F),
	            (0x30A0, 0x30FF),
	            (0x31F0, 0x31FF),
	        )
	        return any(low <= code <= high for low, high in kana_ranges)

	    @classmethod
	    def is_russian_character(cls, ch):
	        """Return True if `ch` is a letter of the Russian alphabet.

	        Covers U+0410..U+044F plus the two letters that sit outside that
	        contiguous run, U+0401 and U+0451. U+0450 is not a Russian letter and
	        is therefore not accepted.
	        """
	        code = ord(ch)
	        return 0x0410 <= code <= 0x044F or code in (0x0401, 0x0451)

	    @staticmethod
	    def convert_to_unicode(text):
	        """Converts `text` to Unicode (if it's not already), assuming utf-8 input.

	        :param text: a text string or a byte string.
	        :return: the text as a unicode string; undecodable bytes are dropped.
	        :raises ValueError: if `text` is neither text nor bytes, or the
	            interpreter is neither Python 2 nor Python 3.
	        """
	        if six.PY3:
	            if isinstance(text, str):
	                return text
	            if isinstance(text, bytes):
	                return text.decode("utf-8", "ignore")
	            raise ValueError("Unsupported string type: %s" % (type(text)))
	        if six.PY2:
	            if isinstance(text, str):
	                return text.decode("utf-8", "ignore")
	            # `unicode` only exists on Python 2; this branch never runs on 3.
	            if isinstance(text, unicode):  # noqa: F821
	                return text
	            raise ValueError("Unsupported string type: %s" % (type(text)))
	        raise ValueError("Not running on Python2 or Python 3?")


	class LowerCase(object):
	    """Length-preserving lowercasing with folding of easily confused letters."""

	    # Maps a letter to the plainer letter it is folded into after lowercasing.
	    confuse_map = {
	        # Cyrillic
	        'Й': 'И',
	        'й': 'и',
	        'ѐ': 'е',
	        'ё': 'е',
	        'ѓ': 'г',
	        'ї': 'і',

	        # Spanish
	        'á': 'a',
	        'é': 'e',
	        'í': 'i',
	        'ó': 'o',
	        'ú': 'u',
	        'ü': 'u',
	        'ñ': 'n',
	    }

	    @classmethod
	    def lowercase(cls, string):
	        """Lowercase `string` and fold confusable letters, keeping the length.

	        Case conversion is delegated to ``str.lower()``, which already covers
	        Cyrillic, so no manual code-point shifting is applied.

	        :param string: any object; it is converted with ``str()`` first.
	        :return: a string with exactly as many characters as ``str(string)``.
	        :raises AssertionError: if the result length differs from the input
	            length (only possible if `confuse_map` holds a multi-character
	            replacement).
	        """
	        text = str(string)
	        lowered = text.lower()
	        if len(lowered) != len(text):
	            # A few characters (e.g. U+0130) lowercase to more than one code
	            # point. Lower character by character and keep any such character
	            # unchanged so that positions still line up with the input.
	            lowered = ''.join(
	                c.lower() if len(c.lower()) == 1 else c
	                for c in text
	            )

	        result = ''.join(cls.confuse_map.get(c, c) for c in lowered)

	        if len(text) != len(result):
	            raise AssertionError('this method should not change the char num. '
	                                 'string: {}, result: {}'.format(text, result))
	        return result


	class Pattern(object):
	    r"""
	    Reusable regular-expression source strings.

	    Reminder of the syntax used below:
	    \d matches any decimal digit.
	    \s matches any whitespace character (space, \t, \n, \r, \f, ...).
	    re{n,m} matches the preceding expression n to m times, greedily.
	    a|b matches either a or b; an escaped pipe would only match a literal `|`.
	    """
	    # One or more ASCII letters, digits or Chinese characters (U+4E00..U+9FA5).
	    alp_num_ch = r'[A-Z0-9a-z\u4e00-\u9fa5]+'
	    # Either one non-alphanumeric character or a run of alphanumerics; used
	    # with ' '.join() to separate letters/digits from other characters.
	    alp_num_or_others = r'[^A-Z0-9a-z]|[A-Z0-9a-z]+'
	    # Shortest parenthesised span.
	    brackets = r'\(.*?\)'
	    hw_ry_xy = r'华为|荣耀|小艺'
	    p_pattern = r'[a-z]\d{1,2}\s+p\d{1,2}'
	    pro_pattern = r'([a-z]+\s*\d{1,2})\s+(p\d{1,2})'
	    any_blanks = r'\s+'
	    # Shortest square-bracketed span.
	    square_brackets = r'\[.*?\]'
	    # Strips a regex class token such as `\d+` from text like `\d+左右`.
	    # NOTE(review): as written this requires two literal backslashes before
	    # the class letter - confirm that matches the text it is applied to.
	    regex_dsw_find = r'\\\\[dDsSwW][\+\*]?'


	class ValidPeriod(object):
	    """Parse a validity period written in Chinese into a number of days.

	    The pattern is assembled from labelled sub-expressions of the form
	    ``?<label>alternatives``. The labels are not valid Python regex syntax:
	    they are read with `get_pattern_label` and stripped with
	    `clean_pattern_label` before the pattern is handed to ``re``.
	    """
	    l_compare_lt = 'l_compare_lt'
	    l_compare_gt = 'l_compare_gt'
	    l_time = 'l_time'
	    l_time_unit = 'l_time_unit'

	    # Every sub-expression starts with its label, followed by `|`-separated
	    # alternatives.
	    l_compare_lt_prefix_regex = rf'?<{l_compare_lt}>不超过|没到|不到|少于'
	    l_compare_gt_prefix_regex = rf'?<{l_compare_gt}>超|超过|超过了|大于|不止'

	    l_compare_lt_suffix_regex = rf'?<{l_compare_lt}>没到|不到|内|以内|之内'
	    l_compare_gt_suffix_regex = rf'?<{l_compare_gt}>以上|不止'

	    # A run of Chinese numerals and/or decimal digits.
	    l_time_regex = rf'?<{l_time}>[两壹零一二三四五六七八九十百千\d]+'
	    l_time_unit_regex = rf'?<{l_time_unit}>(?:个)?年|个月|周|天|日|星期|个星期'

	    # Recognises phrases such as 不到十天, 一个星期, 超七天后, 七日内, 第七天,
	    # 超过了七天, 不止5天: optional comparison prefix, number, unit, optional
	    # comparison suffix, with optional whitespace in between.
	    pattern1 = (
	        rf'(?:({l_compare_lt_prefix_regex})|({l_compare_gt_prefix_regex}))?'
	        rf'\s*({l_time_regex})'
	        rf'\s*({l_time_unit_regex})'
	        rf'\s*(?:({l_compare_lt_suffix_regex})|({l_compare_gt_suffix_regex}))?'
	    )

	    # TODO: date-style phrases such as 上个月5号 or 这个月14日 are not handled.

	    @staticmethod
	    def demo1():
	        """Print the parse result of one sample phrase.

	        Phrases the pattern is meant for:
	        一个星期, 超七天后, 七日内, 第七天, 超过了七天, 不止5天

	        Validity expressions found in annotated data that are not handled
	        yet and may need support later:
	        刚买2天, 昨天取的, 昨天到货, 签收后的第二天, 签收后七天内, 前两天, 货还没发, 用了几天
	        :return: None
	        """
	        string = "5天不止"

	        ret = ValidPeriod.valid_period_parse(string)
	        print(ret)
	        return

	    @staticmethod
	    def time_convert(time_string: str):
	        """Convert a Chinese and/or decimal numeral string to an int.

	        Digits (decimal or Chinese) accumulate until a multiplier (十/百/千)
	        is met; the multiplier is then applied to the accumulated digits, or
	        to 1 when none are pending (so 十五 is 15). Digits left over at the
	        end are added as they are. Unknown characters are ignored.

	        :param time_string: e.g. '十', '二十五', '15', '一百零五'.
	        :return: the integer value; 0 for a string without any numeral.
	        """
	        multipliers = {
	            '十': 10,
	            '百': 100,
	            '千': 1000,
	        }
	        digits = {
	            '壹': 1,
	            '两': 2,
	            '零': 0,
	            '一': 1,
	            '二': 2,
	            '三': 3,
	            '四': 4,
	            '五': 5,
	            '六': 6,
	            '七': 7,
	            '八': 8,
	            '九': 9,
	        }
	        total = 0
	        pending = ''
	        for c in time_string:
	            if c.isdecimal():
	                pending += c
	            elif c in multipliers:
	                total += multipliers[c] * (int(pending) if pending else 1)
	                pending = ''
	            else:
	                pending += str(digits.get(c, ''))

	        # Nothing is pending when the string ends with a multiplier.
	        if pending:
	            total += int(pending)
	        return total

	    @staticmethod
	    def time_unit_convert(time_unit_string: str):
	        """Return the number of days in one unit; unknown units count as 1."""
	        days_per_unit = {
	            '天': 1,
	            '日': 1,
	            '周': 7,
	            '星期': 7,
	            '个星期': 7,
	            '个月': 30,
	            '年': 365,
	            # The unit regex allows an optional 个 before 年.
	            '个年': 365,
	        }
	        return days_per_unit.get(time_unit_string, 1)

	    @staticmethod
	    def get_pattern_label(pattern: str):
	        """Return the label names of a labelled pattern, in order of appearance.

	        :param pattern: pattern containing ``?<label>`` markers.
	        :return: list of label names.
	        """
	        return re.findall(r'\?<(.*?)>', pattern)

	    @staticmethod
	    def clean_pattern_label(pattern: str):
	        """Return `pattern` with every ``?<label>`` marker removed."""
	        return re.sub(r'\?<.*?>', '', pattern)

	    @classmethod
	    def valid_period_parse(cls, string: str) -> tuple:
	        """Parse `string` with `pattern1`.

	        :param string: text that may contain a validity period.
	        :return: ``(days, detail)``; `days` is None when nothing matched and
	            `detail` holds the ``main_time``, ``scale`` and ``bias`` used.
	        """
	        label_name_list, label_string_list = cls._search_label_list(string, cls.pattern1)
	        return cls._estimate_days(label_name_list, label_string_list)

	    @classmethod
	    def _estimate_days(cls, label_name_list, label_string_list) -> tuple:
	        """Combine the matched labels into a day count.

	        ``days = main_time * scale + bias``. A "greater than" label sets the
	        bias to 1; a "less than" label currently leaves it at 0.

	        :return: ``(days, detail)``; `days` is None when either list is empty,
	            i.e. when no label was recognised.
	        """
	        bias = 0
	        scale = 1
	        main_time = 0

	        for label_name, label_string in zip(label_name_list, label_string_list):
	            if label_name == cls.l_compare_gt:
	                bias = 1
	            elif label_name == cls.l_time:
	                main_time += cls.time_convert(label_string)
	            elif label_name == cls.l_time_unit:
	                scale = cls.time_unit_convert(label_string)

	        detail = {
	            'main_time': main_time,
	            'scale': scale,
	            'bias': bias
	        }

	        if not label_name_list or not label_string_list:
	            return None, detail

	        return main_time * scale + bias, detail

	    @classmethod
	    def _search_label_list(cls, string: str, pattern: str) -> tuple:
	        """Search `string` and pair each matched group with its label.

	        The n-th label of `pattern` belongs to the n-th capturing group of
	        the cleaned pattern. Groups that did not take part in the match are
	        dropped.

	        :return: ``(label_names, label_strings)``, two lists of equal length;
	            both empty when the pattern does not match.
	        """
	        label_name_list = cls.get_pattern_label(pattern)
	        match = re.search(pattern=cls.clean_pattern_label(pattern), string=string)
	        if match is None:
	            return list(), list()

	        matched = [
	            (label_name, label_string)
	            for label_name, label_string in zip(label_name_list, match.groups())
	            if label_string is not None
	        ]
	        new_label_name_list = [label_name for label_name, _ in matched]
	        new_label_string_list = [label_string for _, label_string in matched]
	        return new_label_name_list, new_label_string_list

	    def __init__(self):
	        pass