diff options
Diffstat (limited to 'cmd')
-rw-r--r-- | cmd/gen-posix/data.go | 7 | ||||
-rw-r--r-- | cmd/gen-posix/http_hacks.go | 156 | ||||
-rw-r--r-- | cmd/gen-posix/main.go | 4 |
3 files changed, 164 insertions, 3 deletions
diff --git a/cmd/gen-posix/data.go b/cmd/gen-posix/data.go index ac92095..165ecbd 100644 --- a/cmd/gen-posix/data.go +++ b/cmd/gen-posix/data.go @@ -18,7 +18,7 @@ var IEEESA = Vendor{ if err != nil { panic(fmt.Errorf("URL=%q: %v", url, err)) } - cmd := exec.Command("nokogiri", "-e", `puts $_.css("meta[name=\"des\"]").first["content"]`) + cmd := exec.Command("nokogiri", "-e", `puts $_.css("meta[name=\"des\"], meta[name=\"designation\"]").first["content"]`) cmd.Stderr = os.Stderr cmd.Stdin = strings.NewReader(html) d, err := cmd.Output() @@ -35,6 +35,9 @@ var IEEEXplore = Vendor{ Name: "IEEE Xplore", GetURL: func(id string) string { return fmt.Sprintf("http://ieeexplore.ieee.org/servlet/opac?punumber=%s", id) }, GetName: func(id string, url string) string { + if strings.HasSuffix(url, "ERROR") { + return "ERROR" + } html, err := httpcache.Get(url, nil) if err != nil { panic(fmt.Errorf("URL=%q: %v", url, err)) @@ -171,7 +174,7 @@ var Editions = []Edition{ }}, {Vendor: IEEESA, Type: Full, ID: "1003.1-2008", Resellers: []Document{ - {Vendor: IEEEXplore, Type: Full, ID: "7394900"}, + {Vendor: IEEEXplore, Type: Full, ID: "4694974"}, }}, {Vendor: ISO, Type: Full, ID: "50516", Resellers: []Document{ diff --git a/cmd/gen-posix/http_hacks.go b/cmd/gen-posix/http_hacks.go new file mode 100644 index 0000000..16b8a8d --- /dev/null +++ b/cmd/gen-posix/http_hacks.go @@ -0,0 +1,156 @@ +package main + +import ( + "bufio" + "bytes" + "errors" + "fmt" + "io" + "net/http" + "os" + "os/exec" + "strings" + + "git.lukeshu.com/www/lib/httpcache" +) + +func _checkURL(url string) (string, error) { + switch { + case strings.HasPrefix(url, "https://web.archive.org/"): + _, err := httpcache.Get(url, nil) + return url, err + case strings.HasPrefix(url, "https://www2.opengroup.org/ogsys/catalog/"): + _, err := httpcache.Get(url, nil) + if err == nil { + return url, nil + } + if !errors.Is(err, os.ErrNotExist) { // don't hide non-404 errors + return "", err + } + suffix := strings.TrimPrefix(url, "https://www2.opengroup.org/ogsys/catalog/") + url2 := "https://publications.opengroup.org/" + strings.ToLower(suffix) + _, err = httpcache.Get(url2, nil) + if err == nil { + return url2, nil + } + if !errors.Is(err, os.ErrNotExist) { // don't hide non-404 errors + return "", err + } + url3, err := _checkURL("https://web.archive.org/web/20170102/" + url) + if err == nil { + return url3, nil + } + return url+"#ERROR", nil + case url == "http://ieeexplore.ieee.org/servlet/opac?punumber=7394900": + return url+"#ERROR", nil + default: + _, err := httpcache.Get(url, nil) + if err != nil && errors.Is(err, os.ErrNotExist) { + return _checkURL("https://web.archive.org/web/20170102/" + url) + } + return url, err + } +} + +func checkURL(url string) string { + url2, err := _checkURL(url) + if err != nil { + panic(fmt.Errorf("URL=%q: %v", url, err)) + } + return url2 +} + +func nokogiriIgnoreFailure(htmlBytes []byte, expr string) string { + cmd := exec.Command("nokogiri", "-e", "puts "+expr) + cmd.Stderr = io.Discard + cmd.Stdin = bytes.NewReader(htmlBytes) + outBytes, _ := cmd.Output() + return strings.TrimSpace(string(outBytes)) +} + +func mockRedirect(url string) *http.Response { + resp, err := http.ReadResponse(bufio.NewReader(strings.NewReader(""+ + "HTTP/1.1 302 Found\r\n"+ + "Location: "+url+"\r\n"+ + "\r\n")), nil) + if err != nil { + panic(err) + } + return resp +} + +func mockForbidden() *http.Response { + resp, err := http.ReadResponse(bufio.NewReader(strings.NewReader(""+ + "HTTP/1.1 403 Forbidden\r\n"+ + "\r\n")), nil) + if err != nil { + panic(err) + } + return resp +} + +func modifyResponse(url string, entry httpcache.CacheEntry, resp *http.Response) *http.Response { + switch { + case strings.HasPrefix(url, "https://web.archive.org/"): + htmlBytes, _ := io.ReadAll(resp.Body) + _ = resp.Body.Close() + + // native Wayback Machine redirect + redirect := nokogiriIgnoreFailure(htmlBytes, `$_.css("p.impatient a").first["href"]`) + if strings.HasPrefix(redirect, "https://web.archive.org/web/") { + return mockRedirect(redirect) + } + + // silly TOG SSO + if strings.Contains(url, "sso.opengroup.org") { + if bytes.Contains(htmlBytes, []byte("document.forms.postbinding.submit()")) { + redirect := nokogiriIgnoreFailure(htmlBytes, `$_.css("#postbinding").first["action"]`) + if redirect != "" { + return mockRedirect(redirect) + } + } + if bytes.Contains(htmlBytes, []byte("General Authorization Error")) { + return mockForbidden() + } + } + + // We drained resp.Body, so re-create it. + resp, err := http.ReadResponse(bufio.NewReader(strings.NewReader(string(entry))), nil) + if err != nil { + panic(err) + } + return resp + default: + return resp + } +} + +type mock404 struct { + Msg string +} + +// Is implements the interface for [errors.Is]. +func (e *mock404) Is(target error) bool { + return target == os.ErrNotExist +} + +// Error implements [error]. +func (e *mock404) Error() string { + return e.Msg +} + +func checkRedirect(req *http.Request, via []*http.Request) error { + // net/http.defaultCheckRedirect + if len(via) >= 10 { + return errors.New("stopped after 10 redirects") + } + + // detect redirects that should be 404s + oldURL := via[len(via)-1].URL + newURL := req.URL + if (newURL.Path == "/" || newURL.Path == "") && !(oldURL.Path == "/" || oldURL.Path == "") { + return &mock404{Msg: fmt.Sprintf("should have been a 404: %q redirected to %q", oldURL.String(), newURL.String())} + } + + return nil +} diff --git a/cmd/gen-posix/main.go b/cmd/gen-posix/main.go index 7525719..6da598b 100644 --- a/cmd/gen-posix/main.go +++ b/cmd/gen-posix/main.go @@ -18,7 +18,7 @@ func (doc Document) URL() string { } key := doc.Vendor.Name + "\000" + doc.ID if _, have := urls[key]; !have { - urls[key] = doc.Vendor.GetURL(doc.ID) + urls[key] = checkURL(doc.Vendor.GetURL(doc.ID)) } return urls[key] } @@ -185,6 +185,8 @@ var tmpl = `{{define "document"}}{{if .}} func mainWithError() error { httpcache.UserAgent = "https://git.lukeshu.com/www/tree/cmd/gen-posix" + httpcache.ModifyResponse = modifyResponse + httpcache.CheckRedirect = checkRedirect tmpl := template.Must(template.New("page").Parse(tmpl)) |