|
|
@@ -30,6 +30,18 @@ class DataTypeDetector:
|
|
|
|
|
|
clean_text = re.sub(r'[\s-]', '', text)
|
|
|
|
|
|
+ # 排除日期格式 yyyymmdd
|
|
|
+ if len(clean_text) == 8 and clean_text.isdigit():
|
|
|
+ # 检查是否为合法日期
|
|
|
+ try:
|
|
|
+ year = int(clean_text[:4])
|
|
|
+ month = int(clean_text[4:6])
|
|
|
+ day = int(clean_text[6:8])
|
|
|
+ if 1900 <= year <= 2100 and 1 <= month <= 12 and 1 <= day <= 31:
|
|
|
+ return False # 这是日期,不是文本型数字
|
|
|
+ except ValueError:
|
|
|
+ pass
|
|
|
+
|
|
|
if clean_text.isdigit() and len(clean_text) > 15:
|
|
|
return True
|
|
|
|
|
|
@@ -39,6 +51,37 @@ class DataTypeDetector:
|
|
|
return False
|
|
|
|
|
|
@staticmethod
|
|
|
def is_datetime(text: str) -> bool:
    """Return True when ``text`` looks like a date or date-time value.

    Accepted shapes:

    * ``yyyy-mm-dd`` / ``yyyy/mm/dd``, optionally followed by ``hh:mm:ss``
      (found anywhere inside the stripped string; the numeric ranges of
      these forms are not checked),
    * ``yyyy年mm月dd日`` (found anywhere inside the stripped string),
    * ``yyyymmdd`` -- the whole stripped string must be exactly eight
      digits, the year must lie in 1900..2100 and the date must exist in
      the calendar.

    Args:
        text: Value to inspect. Falsy values (``None``, ``""``) are never
            treated as datetimes; anything else is converted with ``str()``.

    Returns:
        ``True`` if one of the shapes above is recognised, else ``False``.
    """
    if not text:
        return False

    # Function-scope import so the block does not rely on the module
    # header already importing ``datetime``.
    from datetime import date

    candidate = str(text).strip()

    # Separator-based formats: a regex hit alone is sufficient.
    separated_patterns = (
        r'\d{4}[-/]\d{1,2}[-/]\d{1,2}',
        r'\d{4}[-/]\d{1,2}[-/]\d{1,2}\s*\d{1,2}:\d{1,2}:\d{1,2}',
        r'\d{4}年\d{1,2}月\d{1,2}日',
    )
    if any(re.search(pattern, candidate) for pattern in separated_patterns):
        return True

    # Compact yyyymmdd: any eight digits match the regex, so the value is
    # additionally checked against the calendar.
    compact = re.search(r'^(\d{4})(\d{2})(\d{2})$', candidate)
    if compact is None:
        return False

    year, month, day = (int(part) for part in compact.groups())
    if not 1900 <= year <= 2100:
        return False
    try:
        # ``date`` raises ValueError for impossible dates such as
        # month 13, 31 February, or 29 February in a non-leap year;
        # a plain ``1 <= day <= 31`` test would let those through.
        date(year, month, day)
    except ValueError:
        return False
    return True
|
|
|
+
|
|
|
+ @staticmethod
|
|
|
def parse_number(text: str) -> float:
|
|
|
"""解析数字,处理千分位和货币符号"""
|
|
|
if not text:
|
|
|
@@ -65,17 +108,33 @@ class DataTypeDetector:
|
|
|
def extract_datetime(text: str) -> str:
    """Extract a date or date-time from ``text`` and normalise its format.

    The stripped string form of ``text`` is checked against these formats,
    in order, and the first hit wins:

    1. ``yyyy-mm-dd hh:mm:ss`` (``-`` or ``/`` separators)
       -> ``yyyy-mm-dd hh:mm:ss`` with every field zero-padded,
    2. ``yyyy-mm-dd`` (``-`` or ``/`` separators) -> ``yyyy-mm-dd``,
    3. ``yyyy年mm月dd日`` -> ``yyyy-mm-dd``,
    4. ``yyyymmdd`` (the whole string, year 1900..2100, real calendar
       date) -> returned as the same eight digits.

    Args:
        text: Value to normalise; converted with ``str()`` and stripped
            before matching.

    Returns:
        The normalised string, or ``text`` itself (untouched) when no
        format is recognised.
    """
    # Function-scope import so the block does not rely on the module
    # header already importing ``datetime``.
    from datetime import date

    candidate = str(text).strip()

    # (regex, has_time_part) -- the most specific pattern comes first so a
    # full timestamp is not truncated to its date portion.
    separated_formats = (
        (r'(\d{4})[-/](\d{1,2})[-/](\d{1,2})\s*(\d{1,2}):(\d{1,2}):(\d{1,2})', True),
        (r'(\d{4})[-/](\d{1,2})[-/](\d{1,2})', False),
        (r'(\d{4})年(\d{1,2})月(\d{1,2})日', False),
    )
    for pattern, has_time_part in separated_formats:
        match = re.search(pattern, candidate)
        if match is None:
            continue
        year, month, day = match.group(1, 2, 3)
        normalised = f"{year}-{month.zfill(2)}-{day.zfill(2)}"
        if has_time_part:
            hour, minute, second = match.group(4, 5, 6)
            normalised += f" {hour.zfill(2)}:{minute.zfill(2)}:{second.zfill(2)}"
        return normalised

    # Compact yyyymmdd: only accepted when it denotes an existing date, so
    # arbitrary eight-digit numbers are not mistaken for dates.
    compact = re.search(r'^(\d{4})(\d{2})(\d{2})$', candidate)
    if compact is not None:
        year, month, day = (int(part) for part in compact.groups())
        if 1900 <= year <= 2100:
            try:
                # Raises ValueError for impossible dates (e.g. 31 February).
                date(year, month, day)
            except ValueError:
                pass
            else:
                # NOTE(review): unlike the other formats this one is returned
                # without dashes, matching the existing formatter -- confirm
                # that keeping the compact form is intended.
                return "".join(compact.groups())

    return text
|
|
|
@@ -90,28 +149,16 @@ class DataTypeDetector:
|
|
|
if not non_empty_values:
|
|
|
return 'text'
|
|
|
|
|
|
+ # 先检测日期时间(优先级最高,避免 yyyymmdd 被误判为文本型数字)
|
|
|
+ datetime_count = sum(1 for v in non_empty_values[:5] if DataTypeDetector.is_datetime(v))
|
|
|
+ if datetime_count >= len(non_empty_values[:5]) * 0.6:
|
|
|
+ return 'datetime'
|
|
|
+
|
|
|
# 检测文本型数字
|
|
|
text_number_count = sum(1 for v in non_empty_values[:5] if DataTypeDetector.is_text_number(v))
|
|
|
if text_number_count >= len(non_empty_values[:5]) * 0.6:
|
|
|
return 'text'
|
|
|
|
|
|
- # 检测日期时间
|
|
|
- datetime_patterns = [
|
|
|
- r'\d{4}[-/]\d{1,2}[-/]\d{1,2}',
|
|
|
- r'\d{4}[-/]\d{1,2}[-/]\d{1,2}\s*\d{1,2}:\d{1,2}:\d{1,2}',
|
|
|
- r'\d{4}年\d{1,2}月\d{1,2}日',
|
|
|
- ]
|
|
|
-
|
|
|
- datetime_count = 0
|
|
|
- for value in non_empty_values[:5]:
|
|
|
- for pattern in datetime_patterns:
|
|
|
- if re.search(pattern, value):
|
|
|
- datetime_count += 1
|
|
|
- break
|
|
|
-
|
|
|
- if datetime_count >= len(non_empty_values[:5]) * 0.6:
|
|
|
- return 'datetime'
|
|
|
-
|
|
|
# 检测数字
|
|
|
numeric_count = sum(1 for v in non_empty_values[:5]
|
|
|
if DataTypeDetector.is_numeric(v) and not DataTypeDetector.is_text_number(v))
|