참고 답변
This question is checking whether you can implement a realistic windowed aggregation with correct ordering, not just count things. You need to parse timestamps reliably, filter by a time window, maintain counts and a stable tie break (first click time), and then compute top $k$ efficiently. A heap or sort is fine depending on $n$ and $k$, but correctness under messy input and clear complexity reasoning matter more. Most people fail on boundary conditions at the window edges and tie-breaking logic.
from __future__ import annotations

import heapq
import json
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from typing import Dict, Iterable, List, Optional, Tuple
7
8
def _parse_rfc3339(ts: str) -> datetime:
    """Parse an RFC 3339 timestamp into a timezone-aware datetime.

    Accepts a trailing 'Z' or 'z' (UTC) as well as numeric offsets such as
    '+00:00'. Fractional seconds may have any number of digits; anything
    beyond microsecond precision is truncated. Surrounding whitespace is
    ignored. Timestamps without any offset are interpreted as UTC.

    Args:
        ts: The timestamp text to parse.

    Returns:
        A timezone-aware ``datetime``.

    Raises:
        ValueError: If the text is not a timestamp ``datetime.fromisoformat``
            can parse after normalisation.
    """
    ts = ts.strip()

    # RFC 3339 allows the UTC designator in either case, but fromisoformat()
    # only understands an uppercase "Z" (and only on newer Pythons), so always
    # rewrite it as an explicit numeric offset.
    if ts[-1:] in ("Z", "z"):
        ts = ts[:-1] + "+00:00"

    # Before Python 3.11, fromisoformat() rejects fractional seconds unless
    # they have exactly 3 or 6 digits. Normalise to 6 digits (pad with zeros,
    # truncate extras) so e.g. ".5" and ".123456789" parse on every version.
    dot = ts.find(".")
    if dot != -1:
        end = dot + 1
        while end < len(ts) and ts[end] in "0123456789":
            end += 1
        digits = ts[dot + 1:end]
        if digits:
            ts = ts[:dot + 1] + digits[:6].ljust(6, "0") + ts[end:]

    dt = datetime.fromisoformat(ts)
    if dt.tzinfo is None:
        # Treat naive timestamps as UTC to avoid silent local-time bugs.
        dt = dt.replace(tzinfo=timezone.utc)
    return dt
23
24
@dataclass
class _UserAgg:
    """Running per-user click statistics for events inside the query window."""

    # Number of in-window click events counted so far for this user.
    clicks: int = 0
    # Earliest in-window click timestamp seen so far; None until the first
    # click has been recorded.
    first_click_time: Optional[datetime] = None
29
30
def top_k_click_users(
    json_lines: Iterable[str],
    k: int,
    t_minutes: int,
    reference_time_rfc3339: str,
) -> List[Tuple[str, int]]:
    """Return the top-k (user_id, click_count) pairs for the last T minutes.

    Each line is expected to be a JSON object with string fields "user_id"
    and "event_time" (an RFC 3339 timestamp) plus an "event_type"; only
    events whose type is exactly "click" are counted.

    The window is (reference_time - T minutes, reference_time]: exclusive at
    the start, inclusive at the end.

    Ordering: more clicks first; ties go to the user whose earliest in-window
    click is older, then to the lexicographically smaller user_id.

    Lines that are not valid JSON, are not JSON objects, lack the required
    fields, or carry an unparseable timestamp are skipped.

    Args:
        json_lines: One JSON document per item.
        k: Maximum number of users to return.
        t_minutes: Window length in minutes, counted back from the reference
            time.
        reference_time_rfc3339: End of the window as an RFC 3339 timestamp.

    Returns:
        Up to ``k`` ``(user_id, click_count)`` tuples in ranking order. An
        empty list when ``k <= 0`` or ``t_minutes < 0``.

    Raises:
        ValueError: If ``reference_time_rfc3339`` cannot be parsed.
    """
    if k <= 0 or t_minutes < 0:
        return []

    ref = _parse_rfc3339(reference_time_rfc3339)
    window_start = ref - timedelta(minutes=t_minutes)

    agg: Dict[str, _UserAgg] = {}

    for line in json_lines:
        try:
            obj = json.loads(line)
        except (TypeError, ValueError):
            # ValueError covers json.JSONDecodeError and, for bytes input,
            # UnicodeDecodeError.
            continue

        # Valid JSON that is not an object (e.g. `[]`, `42`, `null`) has no
        # .get(); skip it like any other malformed line instead of crashing.
        if not isinstance(obj, dict):
            continue

        user_id = obj.get("user_id")
        event_time = obj.get("event_time")
        if (
            obj.get("event_type") != "click"
            or not isinstance(user_id, str)
            or not isinstance(event_time, str)
        ):
            continue

        try:
            ts = _parse_rfc3339(event_time)
        except ValueError:
            continue

        # Window is (start, end] to match common streaming semantics.
        if not (window_start < ts <= ref):
            continue

        ua = agg.get(user_id)
        if ua is None:
            ua = agg[user_id] = _UserAgg()

        ua.clicks += 1
        if ua.first_click_time is None or ts < ua.first_click_time:
            ua.first_click_time = ts

    # Rank by (-clicks, first click, user_id): ascending order on this tuple
    # yields most clicks first with deterministic tie breaks. nsmallest keeps
    # only k candidates at a time instead of sorting every user.
    ranked = heapq.nsmallest(
        k,
        (
            (-ua.clicks, ua.first_click_time, uid)
            for uid, ua in agg.items()
            # Always set once a click is counted; the check only narrows the
            # Optional type so the tuples stay comparable.
            if ua.first_click_time is not None
        ),
    )
    return [(uid, -neg_clicks) for neg_clicks, _, uid in ranked]
97