Skip to content

Commit e35d0b7

Browse files
Thomas Stromberg
authored and committed
slow linkedin down even further
1 parent 0192ad1 commit e35d0b7

File tree

3 files changed

+293
-29
lines changed

3 files changed

+293
-29
lines changed

pkg/cache/cache.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,13 @@ const errorTTL = 5 * 24 * time.Hour // Cache HTTP errors for 5 days
1616

1717
// globalRateLimiter enforces minimum delay between requests to the same domain.
1818
// This prevents overwhelming servers even when running concurrent goroutines.
19-
var globalRateLimiter = NewDomainRateLimiter(600 * time.Millisecond)
19+
var globalRateLimiter = newGlobalRateLimiter()
20+
21+
func newGlobalRateLimiter() *DomainRateLimiter {
22+
r := NewDomainRateLimiter(600 * time.Millisecond)
23+
r.SetDomainDelay("www.linkedin.com", 1200*time.Millisecond)
24+
return r
25+
}
2026

2127
// Stats holds cache hit/miss statistics.
2228
type Stats struct {

pkg/cache/ratelimit.go

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package cache
22

33
import (
4+
"log/slog"
45
"net/url"
56
"sync"
67
"time"
@@ -9,19 +10,27 @@ import (
910
// DomainRateLimiter enforces a minimum delay between requests to the same domain.
// Wait is meant to be called concurrently from multiple goroutines.
//
// SetDomainDelay is the exception: domainOverride is a plain map that is written
// without synchronization and read by Wait while holding only a per-domain lock,
// so overrides must be configured before the limiter is shared between goroutines.
type DomainRateLimiter struct {
	domainOverride map[string]time.Duration // per-domain minimum delays; may be nil on a zero-value limiter
	lastRequest    sync.Map                 // map[string]time.Time
	mu             sync.Map                 // map[string]*sync.Mutex - per-domain locks
	minDelay       time.Duration            // default delay for domains without an override
}

// NewDomainRateLimiter creates a rate limiter that enforces minDelay between
// requests to the same domain. Domain-specific overrides can be set with SetDomainDelay.
func NewDomainRateLimiter(minDelay time.Duration) *DomainRateLimiter {
	return &DomainRateLimiter{
		minDelay:       minDelay,
		domainOverride: make(map[string]time.Duration),
	}
}

// SetDomainDelay sets a custom minimum delay for a specific domain.
// This overrides the default minDelay for requests to this domain; calling it
// again for the same domain replaces the previous override.
//
// It is not synchronized with Wait: call it during setup, before other
// goroutines start using the limiter.
func (r *DomainRateLimiter) SetDomainDelay(domain string, delay time.Duration) {
	// A limiter built as a struct literal or zero value has a nil map, and
	// writing to a nil map panics, so allocate it on first use.
	if r.domainOverride == nil {
		r.domainOverride = make(map[string]time.Duration)
	}
	r.domainOverride[domain] = delay
}
33+
2534
// Wait blocks until it's safe to make a request to the given URL's domain.
2635
// It ensures at least minDelay has passed since the last request to that domain.
2736
func (r *DomainRateLimiter) Wait(rawURL string) {
@@ -40,12 +49,20 @@ func (r *DomainRateLimiter) Wait(rawURL string) {
4049
mu.Lock()
4150
defer mu.Unlock()
4251

52+
// Use domain-specific delay if set, otherwise use default
53+
delay := r.minDelay
54+
if override, ok := r.domainOverride[domain]; ok {
55+
delay = override
56+
}
57+
4358
// Check last request time
4459
if lastI, ok := r.lastRequest.Load(domain); ok {
4560
if last, ok := lastI.(time.Time); ok {
4661
elapsed := time.Since(last)
47-
if elapsed < r.minDelay {
48-
time.Sleep(r.minDelay - elapsed)
62+
if elapsed < delay {
63+
waitTime := delay - elapsed
64+
slog.Debug("rate limiting request", "domain", domain, "wait", waitTime.Round(time.Millisecond))
65+
time.Sleep(waitTime)
4966
}
5067
}
5168
}

pkg/linkedin/linkedin.go

Lines changed: 262 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -109,9 +109,18 @@ func New(ctx context.Context, opts ...Option) (*Client, error) {
109109
cfg.logger.InfoContext(ctx, "linkedin client created", "cookie_count", len(cookies))
110110

111111
return &Client{
112-
httpClient: &http.Client{Jar: jar, Timeout: 3 * time.Second},
113-
cache: cfg.cache,
114-
logger: cfg.logger,
112+
httpClient: &http.Client{
113+
Jar: jar,
114+
Timeout: 3 * time.Second,
115+
CheckRedirect: func(_ *http.Request, via []*http.Request) error {
116+
if len(via) >= 1 {
117+
return http.ErrUseLastResponse
118+
}
119+
return nil
120+
},
121+
},
122+
cache: cfg.cache,
123+
logger: cfg.logger,
115124
}, nil
116125
}
117126

@@ -138,26 +147,39 @@ func (c *Client) Fetch(ctx context.Context, urlStr string) (*profile.Profile, er
138147
return nil, fmt.Errorf("request failed: %w", err)
139148
}
140149

141-
prof, parseErr := parseProfile(body, urlStr)
142-
if parseErr != nil {
143-
// Log additional context for debugging
144-
c.logger.DebugContext(ctx, "linkedin parse failed",
145-
"url", urlStr,
146-
"error", parseErr,
147-
"response_size", len(body),
148-
)
149-
return prof, parseErr
150-
}
151-
152-
// Extract URN for API calls
150+
// Extract username from URL for API calls
153151
username := extractPublicID(urlStr)
154-
memberURN := extractMemberURN(body)
152+
153+
// Try to extract the target profile's member URN from the HTML
154+
// IMPORTANT: The HTML contains URNs for both logged-in user and viewed profile
155+
// We must extract the URN for the TARGET profile, not the logged-in user
156+
memberURN := extractTargetMemberURN(body, username)
155157
c.logger.DebugContext(ctx, "extracted for API call", "username", username, "memberURN", memberURN)
156158

157-
// If no employer found from HTML parsing, try the Voyager API
158-
c.logger.DebugContext(ctx, "checking employer", "employer", prof.Fields["employer"])
159+
// PRIMARY: Use Voyager API to get profile data (avoids logged-in user data contamination)
160+
// The HTML often contains the logged-in user's data mixed with viewed profile data
161+
var prof *profile.Profile
162+
if memberURN != "" {
163+
prof = c.fetchProfileFromAPI(ctx, memberURN, urlStr, username)
164+
}
165+
166+
// FALLBACK: Parse HTML only if API failed
167+
if prof == nil {
168+
var parseErr error
169+
prof, parseErr = parseProfile(body, urlStr)
170+
if parseErr != nil {
171+
c.logger.DebugContext(ctx, "linkedin parse failed",
172+
"url", urlStr,
173+
"error", parseErr,
174+
"response_size", len(body),
175+
)
176+
return prof, parseErr
177+
}
178+
}
179+
180+
// Ensure we have experience data
159181
if prof.Fields["employer"] == "" || prof.Fields["title"] == "" {
160-
if username != "" || memberURN != "" {
182+
if memberURN != "" {
161183
exp := c.fetchExperienceFromAPI(ctx, username, memberURN)
162184
if exp.employer != "" && prof.Fields["employer"] == "" {
163185
prof.Fields["employer"] = exp.employer
@@ -170,7 +192,7 @@ func (c *Client) Fetch(ctx context.Context, urlStr string) (*profile.Profile, er
170192
}
171193
}
172194

173-
// If no location found from HTML parsing, try the Voyager API
195+
// Ensure we have location
174196
if prof.Location == "" && memberURN != "" {
175197
loc := c.fetchLocationFromAPI(ctx, memberURN)
176198
if loc != "" {
@@ -179,12 +201,231 @@ func (c *Client) Fetch(ctx context.Context, urlStr string) (*profile.Profile, er
179201
}
180202
}
181203

182-
return prof, parseErr
204+
// Extract social links from HTML (API doesn't provide these)
205+
prof.SocialLinks = htmlutil.SocialLinks(string(body))
206+
extractContactInfo(prof, string(body))
207+
prof.SocialLinks = filterSamePlatformLinks(prof.SocialLinks)
208+
209+
return prof, nil
183210
}
184211

185212
// EnableDebug enables debug logging.
186213
func (c *Client) EnableDebug() { c.debug = true }
187214

215+
// fetchProfileFromAPI fetches the profile data from the LinkedIn Voyager API.
216+
// This is the primary method for getting profile data as it avoids logged-in user data contamination.
217+
// Uses the /identity/profiles/{publicIdentifier} endpoint which returns profile by username.
218+
func (c *Client) fetchProfileFromAPI(ctx context.Context, _, profileURL, username string) *profile.Profile {
219+
if username == "" {
220+
c.logger.DebugContext(ctx, "no username for profile API call")
221+
return nil
222+
}
223+
224+
if err := c.ensureSessionCookies(ctx); err != nil {
225+
c.logger.DebugContext(ctx, "failed to get session cookies for profile", "error", err)
226+
return nil
227+
}
228+
229+
// Use the identity/profiles endpoint which takes publicIdentifier (username) directly
230+
// This avoids the problem of extracting wrong URN from HTML
231+
apiURL := fmt.Sprintf("https://www.linkedin.com/voyager/api/identity/profiles/%s", url.PathEscape(username))
232+
233+
c.logger.DebugContext(ctx, "fetching profile from voyager api", "url", apiURL)
234+
235+
req, err := http.NewRequestWithContext(ctx, http.MethodGet, apiURL, http.NoBody)
236+
if err != nil {
237+
c.logger.DebugContext(ctx, "profile api request creation failed", "error", err)
238+
return nil
239+
}
240+
241+
setVoyagerHeaders(req, c.httpClient, c.logger)
242+
req.Header.Set("Accept", "application/vnd.linkedin.normalized+json+2.1")
243+
244+
body, err := cache.FetchURL(ctx, c.cache, c.httpClient, req, c.logger)
245+
if err != nil {
246+
c.logger.DebugContext(ctx, "profile api request failed", "error", err)
247+
return nil
248+
}
249+
250+
c.logger.DebugContext(ctx, "profile api response", "bodySize", len(body))
251+
252+
return extractProfileFromIdentityAPI(body, profileURL, username, c.logger)
253+
}
254+
255+
// extractProfileFromIdentityAPI extracts profile data from the /identity/profiles/ API response.
256+
// This endpoint returns profile data with fields like firstName, lastName, headline, geoLocationName.
257+
func extractProfileFromIdentityAPI(body []byte, profileURL, username string, logger *slog.Logger) *profile.Profile {
258+
prof := &profile.Profile{
259+
Platform: platform,
260+
URL: profileURL,
261+
Authenticated: true,
262+
Username: username,
263+
Fields: make(map[string]string),
264+
}
265+
266+
// The identity/profiles API returns JSON with direct fields:
267+
// firstName, lastName, headline, geoLocationName, industryName, etc.
268+
269+
// Extract firstName and lastName
270+
firstName := extractJSONField(string(body), "firstName")
271+
lastName := extractJSONField(string(body), "lastName")
272+
if firstName != "" {
273+
prof.Name = unescapeJSON(firstName)
274+
if lastName != "" {
275+
prof.Name += " " + unescapeJSON(lastName)
276+
}
277+
logger.Debug("extracted name from identity API", "name", prof.Name)
278+
}
279+
280+
// Extract headline (bio)
281+
if headline := extractJSONField(string(body), "headline"); headline != "" {
282+
prof.Bio = unescapeJSON(headline)
283+
logger.Debug("extracted headline from identity API", "headline", prof.Bio)
284+
}
285+
286+
// Extract location
287+
if loc := extractJSONField(string(body), "geoLocationName"); loc != "" {
288+
prof.Location = unescapeJSON(loc)
289+
logger.Debug("extracted location from identity API", "location", prof.Location)
290+
}
291+
292+
// Extract pronouns
293+
pronounRe := regexp.MustCompile(`"standardizedPronoun"\s*:\s*"(HE_HIM|SHE_HER|THEY_THEM)"`)
294+
if m := pronounRe.FindSubmatch(body); len(m) > 1 {
295+
pronouns := convertStandardizedPronoun(string(m[1]))
296+
if pronouns != "" {
297+
prof.Fields["pronouns"] = pronouns
298+
logger.Debug("extracted pronouns from identity API", "pronouns", pronouns)
299+
}
300+
}
301+
302+
// If no name found, return nil to fall back to HTML parsing
303+
if prof.Name == "" {
304+
logger.Debug("no name found in identity API response")
305+
return nil
306+
}
307+
308+
return prof
309+
}
310+
311+
// extractProfileFromGraphQLResponse extracts profile data from the TOP_CARD GraphQL response.
312+
// This is a fallback method if the identity/profiles endpoint fails.
313+
func extractProfileFromGraphQLResponse(body []byte, profileURL, username string, logger *slog.Logger) *profile.Profile {
314+
prof := &profile.Profile{
315+
Platform: platform,
316+
URL: profileURL,
317+
Authenticated: true,
318+
Username: username,
319+
Fields: make(map[string]string),
320+
}
321+
322+
// The TOP_CARD response contains the profile name and headline in "text" fields
323+
// Structure: elements containing titleV2 with text for name, subtitleV2 for headline
324+
// Look for patterns like: "titleV2":{"text":{"text":"Stephen Fox Jr."
325+
326+
// Extract name from titleV2
327+
titleRe := regexp.MustCompile(`"titleV2"\s*:\s*\{[^}]*"text"\s*:\s*\{[^}]*"text"\s*:\s*"([^"]+)"`)
328+
if m := titleRe.FindSubmatch(body); len(m) > 1 {
329+
prof.Name = strings.TrimSpace(string(m[1]))
330+
logger.Debug("extracted name from titleV2", "name", prof.Name)
331+
} else {
332+
logger.Debug("titleV2 pattern not found")
333+
}
334+
335+
// Extract headline/bio from subtitleV2
336+
subtitleRe := regexp.MustCompile(`"subtitleV2"\s*:\s*\{[^}]*"text"\s*:\s*\{[^}]*"text"\s*:\s*"([^"]+)"`)
337+
if m := subtitleRe.FindSubmatch(body); len(m) > 1 {
338+
prof.Bio = strings.TrimSpace(string(m[1]))
339+
logger.Debug("extracted bio from subtitleV2", "bio", prof.Bio)
340+
} else {
341+
logger.Debug("subtitleV2 pattern not found")
342+
}
343+
344+
// Extract location
345+
loc := extractLocationFromGraphQLResponse(body)
346+
if loc != "" {
347+
prof.Location = loc
348+
}
349+
350+
// Extract pronouns - look for standardizedPronoun
351+
pronounRe := regexp.MustCompile(`"standardizedPronoun"\s*:\s*"(HE_HIM|SHE_HER|THEY_THEM)"`)
352+
if m := pronounRe.FindSubmatch(body); len(m) > 1 {
353+
pronouns := convertStandardizedPronoun(string(m[1]))
354+
if pronouns != "" {
355+
prof.Fields["pronouns"] = pronouns
356+
}
357+
}
358+
359+
// If no name found, return nil to fall back to HTML parsing
360+
if prof.Name == "" {
361+
return nil
362+
}
363+
364+
return prof
365+
}
366+
367+
// Static patterns used by extractTargetMemberURN, compiled once at package scope.
var (
	// targetCardURNRe matches fsd_profileCard:(ACoA...,SECTION_TYPE
	targetCardURNRe = regexp.MustCompile(`fsd_profileCard:\((ACoA[A-Za-z0-9_-]+),`)
	// targetProfileURNRe matches fsd_profile:ACoA...
	targetProfileURNRe = regexp.MustCompile(`fsd_profile:(ACoA[A-Za-z0-9_-]+)`)
	// targetAnyURNRe matches any bare ACoA... token.
	targetAnyURNRe = regexp.MustCompile(`ACoA[A-Za-z0-9_-]+`)
)

// extractTargetMemberURN extracts the member URN for the TARGET profile from HTML.
// This is critical because LinkedIn pages contain URNs for both the logged-in user
// and the profile being viewed. We need to find the URN that belongs to the target.
//
// Strategies are tried in order and the first hit wins:
//  1. an fsd_profile URN in the object following "publicIdentifier":"<targetUsername>";
//  2. the first fsd_profileCard:(ACoA...,  URN;
//  3. the most frequent fsd_profile:ACoA... URN, ties going to the one that
//     appears first in body;
//  4. the first bare ACoA... token.
//
// It returns "" when body contains no candidate at all.
func extractTargetMemberURN(body []byte, targetUsername string) string {
	// Strategy 1: URN tied to the target's publicIdentifier.
	// Example: "publicIdentifier":"stephen-fox-jr"... nearby "fsd_profile:ACoA..."
	if targetUsername != "" {
		// This pattern embeds the username, so it has to be built per call;
		// QuoteMeta keeps any regexp metacharacters in the username literal.
		pattern := fmt.Sprintf(`"publicIdentifier"\s*:\s*"%s"[^}]*}[^{]*\{[^}]*fsd_profile:(ACoA[A-Za-z0-9_-]+)`, regexp.QuoteMeta(targetUsername))
		if m := regexp.MustCompile(pattern).FindSubmatch(body); len(m) > 1 {
			return string(m[1])
		}
	}

	// Strategy 2: fsd_profileCard URN, which is typically the viewed profile.
	if m := targetCardURNRe.FindSubmatch(body); len(m) > 1 {
		return string(m[1])
	}

	// Strategy 3: most frequently occurring fsd_profile URN.
	if matches := targetProfileURNRe.FindAllSubmatch(body, -1); len(matches) > 0 {
		counts := make(map[string]int, len(matches))
		for _, m := range matches {
			counts[string(m[1])]++
		}

		// Walk the matches in document order rather than ranging over the map:
		// map iteration order is random, which made ties nondeterministic.
		// The strict '>' keeps the earliest URN among equally frequent ones.
		best, bestCount := "", 0
		for _, m := range matches {
			urn := string(m[1])
			if n := counts[urn]; n > bestCount {
				best, bestCount = urn, n
			}
		}
		return best
	}

	// Last resort: any ACoA pattern. Find returns nil when nothing matches,
	// which converts to the empty string.
	return string(targetAnyURNRe.Find(body))
}
428+
188429
// fetchExperienceFromAPI calls the LinkedIn Voyager API to get profile experience data.
189430
func (c *Client) fetchExperienceFromAPI(ctx context.Context, _ string, memberURN string) experienceData {
190431
// First, make a request to LinkedIn to get session cookies (JSESSIONID)

0 commit comments

Comments
 (0)