score:1

Perhaps this is not the best theoretical approach for working with millions of records. However, it works and can be used as a starting point for further improvements.

/// <summary>
/// Demo program: pairs each badge's "in" punch with the "out" punch that
/// directly follows it and prints the resulting pairs.
/// </summary>
class program
{
    /// <summary>
    /// Builds a small sample list of punches, pairs them per badge and writes one
    /// tab-separated line (badge id, time in, time out) per pair to the console.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        var startingrecords = new List<record>()
        {
            new record(1001930, "a", "in"),
            new record(1004901, "b", "in"),
            new record(1005192, "a", "out"),
            new record(1012933, "a", "in"),
            new record(1014495, "b", "out"),
            new record(1017891, "a", "out"),
        };

        // Sort by badge, then by time, so each badge's punches sit next to each
        // other in chronological order.
        var records = startingrecords.OrderBy(x => x.badgeid).ThenBy(x => x.time).ToList();

        // Zip the sorted list with itself shifted by one element to obtain every
        // (previous, next) neighbour pair, then keep only the neighbours that
        // belong to the same badge and form an "in" followed by an "out".
        var pairs = records.Skip(1).Zip(records, (second, first) => Tuple.Create(first, second))
            .Where(x => x.Item1.badgeid == x.Item2.badgeid &&
                        x.Item1.direction == "in" && x.Item2.direction == "out")
            .Select(x => new pair(x.Item1.badgeid, x.Item1.time, x.Item2.time))
            .ToList();

        foreach (var pair in pairs)
        {
            Console.WriteLine(pair.badgeid + "\t" + pair.timein + "\t" + pair.timeout);
        }

        // Wait for input so the console window stays open.
        Console.Read();
    }
}

/// <summary>
/// A single badge punch: the time it occurred, the badge it belongs to and
/// its direction (the sample data uses "in" and "out").
/// </summary>
class record
{
    public long time { get; set; }
    public string badgeid { get; set; }
    public string direction { get; set; }

    /// <summary>Creates a punch record with the given values.</summary>
    /// <param name="time">Time of the punch.</param>
    /// <param name="badgeid">Identifier of the badge that was punched.</param>
    /// <param name="direction">Direction of the punch.</param>
    public record(long time, string badgeid, string direction)
    {
        // The parameters have the same names as the properties, so the
        // properties must be qualified with "this."; an unqualified
        // "time = time" only assigns the parameter to itself.
        this.time = time;
        this.badgeid = badgeid;
        this.direction = direction;
    }
}

/// <summary>
/// A matched pair of punches for one badge: the time of the "in" punch and
/// the time of the corresponding "out" punch.
/// </summary>
class pair
{
    public string badgeid { get; set; }
    public long timein { get; set; }
    public long timeout { get; set; }

    /// <summary>Creates a pair with the given values.</summary>
    /// <param name="badgeid">Identifier of the badge the pair belongs to.</param>
    /// <param name="timein">Time of the "in" punch.</param>
    /// <param name="timeout">Time of the "out" punch.</param>
    public pair(string badgeid, long timein, long timeout)
    {
        // The parameters have the same names as the properties, so the
        // properties must be qualified with "this."; an unqualified
        // "badgeid = badgeid" only assigns the parameter to itself.
        this.badgeid = badgeid;
        this.timein = timein;
        this.timeout = timeout;
    }
}

Output:

a	1001930	1005192
a	1012933	1017891
b	1004901	1014495

score:1

I'm not sure how efficient or performant this would be, but I think LINQ can translate it into SQL, so if you are using a database, it may push more of the calculation to the server.

First, group the records by badge:

// Group the punches by badge; each group exposes its badge key and that
// badge's punches ordered by time.
var p1 = from p in punches
         group p by p.badge into pg
         select new {
             badge = pg.Key,
             punches = pg.OrderBy(p => p.time)
         };

Then, for each badge's group of records, go through all the "in" records and match each one with the first later "out" record, if it exists:

// For every "in" punch of a badge, emit the badge, the "in" time and the time
// of the first "out" punch of the same badge that is later than the "in".
// NOTE(review): there is no null check on FirstOrDefault(); if the punch type
// is a class and this runs as LINQ to Objects, an "in" without a later "out"
// would throw a NullReferenceException - confirm the query provider in use.
var p2 = p1.SelectMany(pg => pg.punches.Where(p => p.dir == "in")
                                       .Select(p => new {
                                            pg.badge,
                                            timein = p.time,
                                            timeout = pg.punches.Where(po => po.dir == "out" && po.time > p.time)
                                                                .FirstOrDefault().time
                                       }));

Finally, order the result:

// Sort the pairs by badge, then by "in" time.
var ans = p2.OrderBy(bio => bio.badge).ThenBy(bio => bio.timein);

Since LINQ to SQL propagates nulls automatically, I think this will handle a missing "out" punch for an "in", but not orphan "out" punches.

Another possibility is to use the overload of Select whose selector takes two parameters (the element and its index) to group the punch records in pairs, but that only works with LINQ to Objects, so unless you filter the data before processing, the millions of records would all be pulled into memory.

For completeness, here is an attempt at it:

// AsEnumerable() switches the rest of the query to LINQ to Objects, which the
// indexed Select overload below requires.
var p2 = p1.AsEnumerable()
           // Number each badge's punches and bucket them two at a time
           // (index / 2), then read the "in" and "out" time from each bucket.
           // A bucket that lacks one of the two directions yields null for it.
           .SelectMany(pg => pg.punches.Select((p, i) => (p, i))
                                       .GroupBy(pi => pi.i / 2, pi => pi.p)
                                       .Select(pp => new {
                                            pg.badge,
                                            timein = pp.Where(p => p.dir == "in").FirstOrDefault()?.time,
                                            timeout = pp.Where(p => p.dir == "out").FirstOrDefault()?.time
                                       }));

None of this will work very well if your punches aren't well ordered, e.g. if you are missing an initial "in".


Related Query