scoring.py 1.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051
"""
Calculate a similarity score (simscore) between a hypothesis text and a
reference text; see https://github.com/VikParuchuri/marker?tab=readme-ov-file
"""
  4. import math
  5. from rapidfuzz import fuzz
  6. import re
  7. import regex
  8. from statistics import mean
  9. CHUNK_MIN_CHARS = 25
  10. def chunk_text(text, chunk_len=500):
  11. chunks = [text[i:i+chunk_len] for i in range(0, len(text), chunk_len)]
  12. chunks = [c for c in chunks if c.strip() and len(c) > CHUNK_MIN_CHARS]
  13. return chunks
  14. def overlap_score(hypothesis_chunks, reference_chunks):
  15. if len(reference_chunks) > 0:
  16. length_modifier = len(hypothesis_chunks) / len(reference_chunks)
  17. else:
  18. length_modifier = 0
  19. search_distance = max(len(reference_chunks) // 5, 10)
  20. chunk_scores = []
  21. for i, hyp_chunk in enumerate(hypothesis_chunks):
  22. max_score = 0
  23. total_len = 0
  24. i_offset = int(i * length_modifier)
  25. chunk_range = range(max(0, i_offset-search_distance), min(len(reference_chunks), i_offset+search_distance))
  26. for j in chunk_range:
  27. ref_chunk = reference_chunks[j]
  28. score = fuzz.ratio(hyp_chunk, ref_chunk, score_cutoff=30) / 100
  29. if score > max_score:
  30. max_score = score
  31. total_len = len(ref_chunk)
  32. chunk_scores.append(max_score)
  33. return chunk_scores
  34. def score_text(hypothesis, reference):
  35. # Returns a 0-1 alignment score
  36. hypothesis_chunks = chunk_text(hypothesis)
  37. reference_chunks = chunk_text(reference)
  38. chunk_scores = overlap_score(hypothesis_chunks, reference_chunks)
  39. if len(chunk_scores) > 0:
  40. mean_score = mean(chunk_scores)
  41. return mean_score
  42. else:
  43. return 0
  44. #return mean(chunk_scores)