package main import ( "bufio" "bytes" "errors" "fmt" "io" "net/http" "os" "os/exec" "strings" "git.lukeshu.com/www/lib/httpcache" ) func _checkURL(url string) (string, error) { switch { case strings.HasPrefix(url, "https://web.archive.org/"): _, err := httpcache.Get(url, nil) return url, err case strings.HasPrefix(url, "https://www2.opengroup.org/ogsys/catalog/"): _, err := httpcache.Get(url, nil) if err == nil { return url, nil } if !errors.Is(err, os.ErrNotExist) { // don't hide non-404 errors return "", err } suffix := strings.TrimPrefix(url, "https://www2.opengroup.org/ogsys/catalog/") url2 := "https://publications.opengroup.org/" + strings.ToLower(suffix) _, err = httpcache.Get(url2, nil) if err == nil { return url2, nil } if !errors.Is(err, os.ErrNotExist) { // don't hide non-404 errors return "", err } url3, err := _checkURL("https://web.archive.org/web/20170102/" + url) if err == nil { return url3, nil } return url+"#ERROR", nil case url == "http://ieeexplore.ieee.org/servlet/opac?punumber=7394900": return url+"#ERROR", nil default: _, err := httpcache.Get(url, nil) if err != nil && errors.Is(err, os.ErrNotExist) { return _checkURL("https://web.archive.org/web/20170102/" + url) } return url, err } } func checkURL(url string) string { url2, err := _checkURL(url) if err != nil { panic(fmt.Errorf("URL=%q: %v", url, err)) } return url2 } func nokogiriIgnoreFailure(htmlBytes []byte, expr string) string { cmd := exec.Command("nokogiri", "-e", "puts "+expr) cmd.Stderr = io.Discard cmd.Stdin = bytes.NewReader(htmlBytes) outBytes, _ := cmd.Output() return strings.TrimSpace(string(outBytes)) } func mockRedirect(url string) *http.Response { resp, err := http.ReadResponse(bufio.NewReader(strings.NewReader(""+ "HTTP/1.1 302 Found\r\n"+ "Location: "+url+"\r\n"+ "\r\n")), nil) if err != nil { panic(err) } return resp } func mockForbidden() *http.Response { resp, err := http.ReadResponse(bufio.NewReader(strings.NewReader(""+ "HTTP/1.1 403 Forbidden\r\n"+ "\r\n")), nil) if err != nil { panic(err) } return resp } func modifyResponse(url string, entry httpcache.CacheEntry, resp *http.Response) *http.Response { switch { case strings.HasPrefix(url, "https://web.archive.org/"): htmlBytes, _ := io.ReadAll(resp.Body) _ = resp.Body.Close() // native Wayback Machine redirect redirect := nokogiriIgnoreFailure(htmlBytes, `$_.css("p.impatient a").first["href"]`) if strings.HasPrefix(redirect, "https://web.archive.org/web/") { return mockRedirect(redirect) } // silly TOG SSO if strings.Contains(url, "sso.opengroup.org") { if bytes.Contains(htmlBytes, []byte("document.forms.postbinding.submit()")) { redirect := nokogiriIgnoreFailure(htmlBytes, `$_.css("#postbinding").first["action"]`) if redirect != "" { return mockRedirect(redirect) } } if bytes.Contains(htmlBytes, []byte("General Authorization Error")) { return mockForbidden() } } // We drained resp.Body, so re-create it. resp, err := http.ReadResponse(bufio.NewReader(strings.NewReader(string(entry))), nil) if err != nil { panic(err) } return resp default: return resp } } type mock404 struct { Msg string } // Is implements the interface for [errors.Is]. func (e *mock404) Is(target error) bool { return target == os.ErrNotExist } // Error implements [error]. func (e *mock404) Error() string { return e.Msg } func checkRedirect(req *http.Request, via []*http.Request) error { // net/http.defaultCheckRedirect if len(via) >= 10 { return errors.New("stopped after 10 redirects") } // detect redirects that should be 404s oldURL := via[len(via)-1].URL newURL := req.URL if (newURL.Path == "/" || newURL.Path == "") && !(oldURL.Path == "/" || oldURL.Path == "") { return &mock404{Msg: fmt.Sprintf("should have been a 404: %q redirected to %q", oldURL.String(), newURL.String())} } return nil }