Skip to content

Commit b980aa0

Browse files
committed
fix: 비정상 경험 판단 로직 추가
1 parent f81d7cd commit b980aa0

1 file changed

Lines changed: 15 additions & 10 deletions

File tree

src/experience/service.py

Lines changed: 15 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -36,8 +36,8 @@
3636
openai_client = AsyncOpenAI(api_key=settings.OPENAI_API_KEY)
3737

3838
# Content quality patterns
39-
_MEANINGFUL_CHAR_PATTERN = re.compile(r'[가-힣a-zA-Z0-9]')
4039
_HANGUL_JAMO_ONLY_PATTERN = re.compile(r'^[ㄱ-ㅎㅏ-ㅣ\s.,!?~ㅋㅎㅠㅜ]+$')
40+
_REPEATED_CHAR_PATTERN = re.compile(r'(.)\1{3,}') # 같은 문자 4번 이상 반복
4141
_MIN_CONTENT_LENGTH = 20 # 본문 최소 글자수 기준
4242

4343

@@ -421,23 +421,28 @@ def _calculate_content_quality(experience: Experience) -> float:
421421

422422
combined = " ".join(content_parts)
423423

424+
total_chars = len(combined.replace(" ", ""))
425+
if total_chars == 0:
426+
return 0.1
427+
424428
# 자모만으로 이루어진 텍스트 (ㅋㅋㅋ, ㅎㅎㅎ 등)
425429
if _HANGUL_JAMO_ONLY_PATTERN.match(combined):
426430
return 0.1
427431

428-
# 의미 있는 문자 비율 (완성된 한글, 영문, 숫자)
429-
meaningful_chars = len(_MEANINGFUL_CHAR_PATTERN.findall(combined))
430-
total_chars = len(combined.replace(" ", ""))
431-
432-
if total_chars == 0:
432+
# 같은 문자 반복 패턴 (aaaa, ㅋㅋㅋㅋ 등)
433+
if _REPEATED_CHAR_PATTERN.search(combined):
433434
return 0.1
434435

435-
quality_ratio = meaningful_chars / total_chars
436+
# 문자 다양성이 극단적으로 낮으면 쓰레기 입력 (abcabc, 123123 등)
437+
unique_ratio = len(set(combined.replace(" ", ""))) / total_chars
438+
if unique_ratio < 0.2:
439+
return 0.1
436440

437-
# 최소 글자수 기준 미만이면 감점
438-
length_factor = min(len(combined) / _MIN_CONTENT_LENGTH, 1.0)
441+
# 최소 글자수 미만이면 소폭 감점, 그 외는 정상
442+
if total_chars < _MIN_CONTENT_LENGTH:
443+
return 0.7
439444

440-
return min(quality_ratio * length_factor, 1.0)
445+
return 1.0
441446

442447

443448
def _calculate_category_matching_score(experience_category: str, question_category: str | None) -> float:

0 commit comments

Comments
 (0)