summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Luke T. Shumaker <lukeshu@lukeshu.com> 2025-05-20 23:22:30 -0400
committer Luke T. Shumaker <lukeshu@lukeshu.com> 2025-05-21 01:16:46 -0400
commit d8bcbe893b909276cb51004204da708121c9dd22 (patch)
tree 26f59d37cad13292d1bcb7ee7f615f5a1335da1d
parent b51fb9122deced304bf316d7f8ac3309324e35f7 (diff)
gen-posix: Use the cache, check errors, get it working like it was (HEAD, main)
as much as I can, anyway
-rw-r--r-- cmd/gen-posix/data.go 7
-rw-r--r-- cmd/gen-posix/http_hacks.go 156
-rw-r--r-- cmd/gen-posix/main.go 4
3 files changed, 164 insertions, 3 deletions
diff --git a/cmd/gen-posix/data.go b/cmd/gen-posix/data.go
index ac92095..165ecbd 100644
--- a/cmd/gen-posix/data.go
+++ b/cmd/gen-posix/data.go
@@ -18,7 +18,7 @@ var IEEESA = Vendor{
if err != nil {
panic(fmt.Errorf("URL=%q: %v", url, err))
}
- cmd := exec.Command("nokogiri", "-e", `puts $_.css("meta[name=\"des\"]").first["content"]`)
+ cmd := exec.Command("nokogiri", "-e", `puts $_.css("meta[name=\"des\"], meta[name=\"designation\"]").first["content"]`)
cmd.Stderr = os.Stderr
cmd.Stdin = strings.NewReader(html)
d, err := cmd.Output()
@@ -35,6 +35,9 @@ var IEEEXplore = Vendor{
Name: "IEEE Xplore",
GetURL: func(id string) string { return fmt.Sprintf("http://ieeexplore.ieee.org/servlet/opac?punumber=%s", id) },
GetName: func(id string, url string) string {
+ if strings.HasSuffix(url, "ERROR") {
+ return "ERROR"
+ }
html, err := httpcache.Get(url, nil)
if err != nil {
panic(fmt.Errorf("URL=%q: %v", url, err))
@@ -171,7 +174,7 @@ var Editions = []Edition{
}},
{Vendor: IEEESA, Type: Full, ID: "1003.1-2008", Resellers: []Document{
- {Vendor: IEEEXplore, Type: Full, ID: "7394900"},
+ {Vendor: IEEEXplore, Type: Full, ID: "4694974"},
}},
{Vendor: ISO, Type: Full, ID: "50516", Resellers: []Document{
diff --git a/cmd/gen-posix/http_hacks.go b/cmd/gen-posix/http_hacks.go
new file mode 100644
index 0000000..16b8a8d
--- /dev/null
+++ b/cmd/gen-posix/http_hacks.go
@@ -0,0 +1,156 @@
+package main
+
+import (
+ "bufio"
+ "bytes"
+ "errors"
+ "fmt"
+ "io"
+ "net/http"
+ "os"
+ "os/exec"
+ "strings"
+
+ "git.lukeshu.com/www/lib/httpcache"
+)
+
+func _checkURL(url string) (string, error) {
+ switch {
+ case strings.HasPrefix(url, "https://web.archive.org/"):
+ _, err := httpcache.Get(url, nil)
+ return url, err
+ case strings.HasPrefix(url, "https://www2.opengroup.org/ogsys/catalog/"):
+ _, err := httpcache.Get(url, nil)
+ if err == nil {
+ return url, nil
+ }
+ if !errors.Is(err, os.ErrNotExist) { // don't hide non-404 errors
+ return "", err
+ }
+ suffix := strings.TrimPrefix(url, "https://www2.opengroup.org/ogsys/catalog/")
+ url2 := "https://publications.opengroup.org/" + strings.ToLower(suffix)
+ _, err = httpcache.Get(url2, nil)
+ if err == nil {
+ return url2, nil
+ }
+ if !errors.Is(err, os.ErrNotExist) { // don't hide non-404 errors
+ return "", err
+ }
+ url3, err := _checkURL("https://web.archive.org/web/20170102/" + url)
+ if err == nil {
+ return url3, nil
+ }
+ return url+"#ERROR", nil
+ case url == "http://ieeexplore.ieee.org/servlet/opac?punumber=7394900":
+ return url+"#ERROR", nil
+ default:
+ _, err := httpcache.Get(url, nil)
+ if err != nil && errors.Is(err, os.ErrNotExist) {
+ return _checkURL("https://web.archive.org/web/20170102/" + url)
+ }
+ return url, err
+ }
+}
+
+func checkURL(url string) string {
+ url2, err := _checkURL(url)
+ if err != nil {
+ panic(fmt.Errorf("URL=%q: %v", url, err))
+ }
+ return url2
+}
+
+func nokogiriIgnoreFailure(htmlBytes []byte, expr string) string {
+ cmd := exec.Command("nokogiri", "-e", "puts "+expr)
+ cmd.Stderr = io.Discard
+ cmd.Stdin = bytes.NewReader(htmlBytes)
+ outBytes, _ := cmd.Output()
+ return strings.TrimSpace(string(outBytes))
+}
+
+func mockRedirect(url string) *http.Response {
+ resp, err := http.ReadResponse(bufio.NewReader(strings.NewReader(""+
+ "HTTP/1.1 302 Found\r\n"+
+ "Location: "+url+"\r\n"+
+ "\r\n")), nil)
+ if err != nil {
+ panic(err)
+ }
+ return resp
+}
+
+func mockForbidden() *http.Response {
+ resp, err := http.ReadResponse(bufio.NewReader(strings.NewReader(""+
+ "HTTP/1.1 403 Forbidden\r\n"+
+ "\r\n")), nil)
+ if err != nil {
+ panic(err)
+ }
+ return resp
+}
+
+func modifyResponse(url string, entry httpcache.CacheEntry, resp *http.Response) *http.Response {
+ switch {
+ case strings.HasPrefix(url, "https://web.archive.org/"):
+ htmlBytes, _ := io.ReadAll(resp.Body)
+ _ = resp.Body.Close()
+
+ // native Wayback Machine redirect
+ redirect := nokogiriIgnoreFailure(htmlBytes, `$_.css("p.impatient a").first["href"]`)
+ if strings.HasPrefix(redirect, "https://web.archive.org/web/") {
+ return mockRedirect(redirect)
+ }
+
+ // silly TOG SSO
+ if strings.Contains(url, "sso.opengroup.org") {
+ if bytes.Contains(htmlBytes, []byte("document.forms.postbinding.submit()")) {
+ redirect := nokogiriIgnoreFailure(htmlBytes, `$_.css("#postbinding").first["action"]`)
+ if redirect != "" {
+ return mockRedirect(redirect)
+ }
+ }
+ if bytes.Contains(htmlBytes, []byte("General Authorization Error")) {
+ return mockForbidden()
+ }
+ }
+
+ // We drained resp.Body, so re-create it.
+ resp, err := http.ReadResponse(bufio.NewReader(strings.NewReader(string(entry))), nil)
+ if err != nil {
+ panic(err)
+ }
+ return resp
+ default:
+ return resp
+ }
+}
+
+type mock404 struct {
+ Msg string
+}
+
+// Is implements the interface for [errors.Is].
+func (e *mock404) Is(target error) bool {
+ return target == os.ErrNotExist
+}
+
+// Error implements [error].
+func (e *mock404) Error() string {
+ return e.Msg
+}
+
+func checkRedirect(req *http.Request, via []*http.Request) error {
+ // net/http.defaultCheckRedirect
+ if len(via) >= 10 {
+ return errors.New("stopped after 10 redirects")
+ }
+
+ // detect redirects that should be 404s
+ oldURL := via[len(via)-1].URL
+ newURL := req.URL
+ if (newURL.Path == "/" || newURL.Path == "") && !(oldURL.Path == "/" || oldURL.Path == "") {
+ return &mock404{Msg: fmt.Sprintf("should have been a 404: %q redirected to %q", oldURL.String(), newURL.String())}
+ }
+
+ return nil
+}
diff --git a/cmd/gen-posix/main.go b/cmd/gen-posix/main.go
index 7525719..6da598b 100644
--- a/cmd/gen-posix/main.go
+++ b/cmd/gen-posix/main.go
@@ -18,7 +18,7 @@ func (doc Document) URL() string {
}
key := doc.Vendor.Name + "\000" + doc.ID
if _, have := urls[key]; !have {
- urls[key] = doc.Vendor.GetURL(doc.ID)
+ urls[key] = checkURL(doc.Vendor.GetURL(doc.ID))
}
return urls[key]
}
@@ -185,6 +185,8 @@ var tmpl = `{{define "document"}}{{if .}}
func mainWithError() error {
httpcache.UserAgent = "https://git.lukeshu.com/www/tree/cmd/gen-posix"
+ httpcache.ModifyResponse = modifyResponse
+ httpcache.CheckRedirect = checkRedirect
tmpl := template.Must(template.New("page").Parse(tmpl))