score:1

Accepted answer

Here's an approach that preprocesses your input with a fuzzy match; the result can then be used as input to your existing code.

The idea is to first generate the 2-combinations of your input tuples, fuzzy-match each pair to build a map that holds, per key, the distinct sets of mutually matching values, and finally use that map to fuzzy-match your original input.

To make sure more arbitrary cases are covered, I've expanded your input:

// Sample (key, value) pairs; several values per key are different spellings of the same measurement.
val input = List(
  ("a", "10 in"), ("a", "15 in"), ("a", "10 inches"), ("a", "15 inches"), ("a", "15.00 inches"),
  ("b", "2 cm"), ("b", "4 cm"), ("b", "2.00 cm"),
  ("c", "7 cm"), ("c", "7 in")
)

// Trivialized fuzzy match: two values match when they are equal after
// lower-casing, removing every ".00" and abbreviating "inches" to "in".
def fuzzymatch(s1: String, s2: String): Boolean = {
  // One shared normalization so both operands are always treated identically.
  def normalize(s: String): String =
    s.toLowerCase.replace(".00", "").replace("inches", "in")

  normalize(s1) == normalize(s2)
}

// Create a map of sets of fuzzy-matched values from all 2-combinations per key.
// For every key, the result holds the sets of values that fuzzy-match each other;
// values that match nothing else under their key do not appear at all.
val fuzmap = input.combinations(2).foldLeft(Map.empty[String, Seq[Set[String]]]) {
  // combinations(2) also pairs tuples that belong to different keys, so the key
  // equality guard is required: without it a cross-key match would be filed under
  // the first tuple's key.
  case (m, Seq((k1, v1), (k2, v2))) if k1 == k2 && fuzzymatch(v1, v2) =>
    val pair = Set(v1, v2)
    val known = m.getOrElse(k1, Seq.empty[Set[String]])
    def touches(s: Set[String]): Boolean = s.contains(v1) || s.contains(v2)

    val updated =
      if (known.exists(touches)) known.map(s => if (touches(s)) s ++ pair else s) // grow existing set(s)
      else known :+ pair                                                          // start a new set
    m + (k1 -> updated)

  // Different keys or no fuzzy match: nothing to record.
  case (m, _) => m
}
// fuzmap: scala.collection.immutable.Map[String,Seq[Set[String]]] = Map(
//   a -> List(Set(10 in, 10 inches), Set(15 in, 15 inches, 15.00 inches)),
//   b -> List(Set(2 cm, 2.00 cm))
// )

Note that for large input, it makes sense to first `groupBy` key and generate the 2-combinations per key; this avoids comparing pairs of tuples that belong to different keys.

The next step is to fuzzy-match the original input using the created map:

// Fuzzy-match original input using fuzmap: every value that belongs to one of its
// key's fuzzy sets is replaced by that set's smallest member (String ordering);
// all other tuples are passed through unchanged.
val fuzinput = input.map { case (k, v) =>
  val canonical = fuzmap
    .get(k)                          // key may have no fuzzy sets at all
    .flatMap(_.find(_.contains(v)))  // first set of this key that holds the value
    .map(_.min)                      // representative of that set
  (k, canonical.getOrElse(v))
}
// fuzinput: List[(String, String)] = List(
//   (a,10 in), (a,15 in), (a,10 in), (a,15 in), (a,15 in),
//   (b,2 cm), (b,4 cm), (b,2 cm),
//   (c,7 cm), (c,7 in)
// )

score:2

If for some reason the approach of converting to numerical values doesn't work for you, here is some code that seems to do what you want:

// Stand-in fuzzy match: two values match when they are equal or when they form,
// in either order, one of the hard-coded equivalent pairs below.
def fuzzymatch(s1: String, s2: String): Boolean = {
  // fake implementation
  val matches = List(("15 inches", "15.00 inches"), ("2 cm", "2.00 cm"))
  // `==` is used instead of `.equals` so a null argument compares instead of throwing.
  s1 == s2 || matches.exists { case (m1, m2) =>
    (m1 == s1 && m2 == s2) || (m1 == s2 && m2 == s1)
  }
}

 // Groups the sample input by key, picks one value per key and prints, for each key,
 // the picked value together with the fraction of that key's values that fuzzy-match it.
 def test(): Unit = {
   val input = List(("a", "15 inches"), ("a", "15.00 inches"), ("a", "10 in"), ("b", "2 cm"), ("b", "2.00 cm"))
   // key -> all values seen for that key (strict `map` instead of the deprecated `mapValues`)
   val bykey = input.groupBy(_._1).map { case (k, pairs) => k -> pairs.map(_._2) }

   // `.toList` replaces `breakOut`, which needs an extra import and no longer exists in Scala 2.13.
   val processedinput: List[(String, String, Double)] = bykey.toList.map { case (mk, values) =>
     val mv = values.head // random "max" selection logic; groupBy never produces an empty group
     val matchcount = values.count(tv => fuzzymatch(tv, mv))
     (mk, mv, matchcount.toDouble / values.size)
   }

   println(processedinput)
 }

This prints:

List((b,2 cm,1.0), (a,15 inches,0.6666666666666666))


Related Query

More Query from same tag