-
Notifications
You must be signed in to change notification settings - Fork 9
Populate sender email and recipients in threads output #78
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,6 +12,7 @@ import ( | |
| var ( | ||
| entryBlockRe = regexp.MustCompile(`(?s)data-entry-id="(\d+)"`) | ||
| senderRe = regexp.MustCompile(`id="sender_entry_(\d+)"[^>]*>\s*([^<]+?)\s*<`) | ||
| senderEmailRe = regexp.MustCompile(`(?s)sender_entry_(\d+).*?entry__sender-email[^>]*><span[^>]*>[^<]*</span>([^<]+)<`) | ||
| timeRe = regexp.MustCompile(`<time[^>]*datetime="([^"]+)"`) | ||
| srcdocRe = regexp.MustCompile(`(?s)srcdoc="([^"]*trix-content[^"]*)"`) | ||
| fullRecipientsRe = regexp.MustCompile(`(?s)entry__full-recipients[^>]*>(.*?)</span>`) | ||
|
|
@@ -86,6 +87,12 @@ func ParseTopicEntriesHTML(html string) []models.Entry { | |
| senders[m[1]] = m[2] | ||
| } | ||
| } | ||
| senderEmails := map[string]string{} | ||
| for _, m := range senderEmailRe.FindAllStringSubmatch(html, -1) { | ||
| if _, exists := senderEmails[m[1]]; !exists { | ||
| senderEmails[m[1]] = strings.TrimSpace(m[2]) | ||
| } | ||
| } | ||
|
|
||
| // Associate times with entries by finding the first <time> after each entry anchor | ||
| entryTimes := map[string]string{} | ||
|
|
@@ -100,6 +107,35 @@ func ParseTopicEntriesHTML(html string) []models.Entry { | |
| } | ||
| } | ||
|
|
||
| // Associate recipients with entries by slicing between entry anchors. | ||
| entryRecipients := map[string][]models.Contact{} | ||
| for i, eid := range entryIDs { | ||
| anchor := fmt.Sprintf(`id="entry_%s"`, eid) | ||
| start := strings.Index(html, anchor) | ||
| if start < 0 { | ||
| continue | ||
| } | ||
| end := len(html) | ||
| if i+1 < len(entryIDs) { | ||
| nextAnchor := fmt.Sprintf(`id="entry_%s"`, entryIDs[i+1]) | ||
| if n := strings.Index(html[start:], nextAnchor); n > 0 { | ||
| end = start + n | ||
| } | ||
| } | ||
|
Comment on lines
+112
to
+124
|
||
| m := fullRecipientsRe.FindStringSubmatch(html[start:end]) | ||
| if m == nil { | ||
| continue | ||
| } | ||
| seen := map[string]bool{} | ||
| for _, addr := range extractEmails(m[1]) { | ||
| if seen[addr] { | ||
| continue | ||
| } | ||
| seen[addr] = true | ||
| entryRecipients[eid] = append(entryRecipients[eid], models.Contact{EmailAddress: addr}) | ||
| } | ||
| } | ||
|
Comment on lines
+110
to
+137
|
||
|
|
||
| // Extract bodies from srcdoc iframes - they appear in entry order | ||
| type body struct{ html, text string } | ||
| bodyMatches := srcdocRe.FindAllStringSubmatch(html, -1) | ||
|
|
@@ -122,7 +158,10 @@ func ParseTopicEntriesHTML(html string) []models.Entry { | |
| CreatedAt: entryTimes[eid], | ||
| } | ||
| if name, ok := senders[eid]; ok { | ||
| e.Creator = models.Contact{Name: name} | ||
| e.Creator = models.Contact{Name: name, EmailAddress: senderEmails[eid]} | ||
| } | ||
| if recips, ok := entryRecipients[eid]; ok { | ||
| e.Recipients = recips | ||
| } | ||
| if i < len(bodies) { | ||
| e.Body = bodies[i].text | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`senderEmailRe` is very loosely scoped: it matches `sender_entry_(\d+)` and then uses `.*?` with DOTALL to find the next `entry__sender-email` anywhere later in the document. If any sender block is missing the expected `entry__sender-email` markup (or if `sender_entry_###` appears outside the sender element), this can mis-associate an email with the wrong entry ID. Consider tightening the regex to anchor on `id="sender_entry_(\d+)"` and constrain the match to within the sender element (e.g., stop at `</a>`), or extract the sender block first and then parse the email within that substring.