summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- .gitignore 4
-rw-r--r-- Makefile 25
-rwxr-xr-x cdx_json 7
-rwxr-xr-x urlkey2url 8
4 files changed, 44 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..745d6d4
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+*.json
+*/
+*.mk
+*.txt
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..14db651
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,48 @@
+# Locate every archived copy of the Unicode CVTUTF / ConvertUTF sources in the
+# web.archive.org CDX index and save one capture listing per URL key.
+#
+# Flow: ftp.json + www.json -> urlkeys.txt -> urlkeys.mk -> each-cdx/<key>.json
+all: each-cdx
+
+# Listings for everything under /Public/ on each host; collapse=urlkey is
+# passed through to the CDX server.
+ftp.json: cdx_json
+	./cdx_json 'url=ftp.unicode.org/Public/*' 'collapse=urlkey' > $@
+www.json: cdx_json
+	./cdx_json 'url=www.unicode.org/Public/*' 'collapse=urlkey' > $@
+
+# Split each row on '"' and keep fields 2 and 10 (the urlkey and, presumably,
+# the HTTP status -- confirm against the CDX column order). Rows whose field 10
+# is not 200 are dropped, query strings are cut off, the ",ftp" host label is
+# folded away, and only cvtutf/convertutf keys survive.
+# The Makefile stays a prerequisite so that editing this pipeline forces a
+# rebuild, but only the .json inputs are fed to it.
+urlkeys.txt: ftp.json www.json Makefile
+	cat $(filter %.json,$^) | cut -d '"' -f2,10 | sed -n 's/"200$$//p' | cut -d '?' -f1 | sed 's/,ftp)/)/' | sort -u | grep -i -e cvtutf -e convertutf > $@
+
+# Wrap every key in a "urlkeys+=" assignment so that make can include the list.
+urlkeys.mk: urlkeys.txt
+	sed 's/^/urlkeys+=/' < $< > $@
+
+# make builds this fragment on demand and re-reads itself afterwards.
+-include urlkeys.mk
+
+# ")" and "," would terminate or split a function call, so they are passed to
+# $(subst) through variables.
+rp = )
+c = ,
+# Each key as listed, plus the same key with the ",ftp" label put back.
+all_urlkeys = $(urlkeys) $(subst $(rp),$(c)ftp$(rp),$(urlkeys))
+
+# One capture listing per key; the stem is the urlkey, which urlkey2url turns
+# back into a host/path URL for the query.
+each-cdx/%.json: cdx_json urlkey2url
+	mkdir -p '$(@D)'
+	./cdx_json "url=$$(./urlkey2url '$*')" 'collapse=digest' > '$@'
+
+each-cdx: $(addprefix each-cdx/,$(addsuffix .json,$(all_urlkeys)))
+# each-cdx is also the output directory and all is never a file: both are phony.
+.PHONY: all each-cdx
+
+.DELETE_ON_ERROR:
+.SECONDARY:
diff --git a/cdx_json b/cdx_json
new file mode 100755
index 0000000..81284af
--- /dev/null
+++ b/cdx_json
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+#
+# Usage: cdx_json key=value...
+# Query the web.archive.org CDX search endpoint and print the JSON response on
+# stdout. Every key=value argument becomes one query parameter; the value is
+# piped through the external `urlencode` command, which must be on PATH.
+#
+# Exits non-zero when encoding fails or the server answers with an HTTP error,
+# so that callers never mistake an error page for a result.
+set -eo pipefail
+
+url='http://web.archive.org/cdx/search/cdx?output=json'
+for arg in "$@"; do
+  # Split on the first "=": the part before it is the key, the rest the value.
+  url+="&${arg%%=*}=$(printf '%s' "${arg#*=}"|urlencode)"
+done
+# -sS: no progress meter, but still print errors; --fail: HTTP >= 400 fails.
+curl -sS --fail "$url"
diff --git a/urlkey2url b/urlkey2url
new file mode 100755
index 0000000..5d0ec3d
--- /dev/null
+++ b/urlkey2url
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+#
+# Usage: urlkey2url urlkey...
+# Print, one per line, each urlkey "tld,domain,host)/path" rewritten as
+# "host.domain.tld/path".
+
+for arg in "$@"; do
+  keydomain="${arg%%)*}"
+  # A key without ")" has no path part; "${arg#*)}" would repeat the whole key.
+  keypath=''
+  if [[ $arg == *')'* ]]; then
+    keypath="${arg#*)}"
+  fi
+  # Split on "," with read (no globbing, no quote processing as with an
+  # unquoted expansion piped through xargs) and prepend each label in turn,
+  # which reverses their order.
+  IFS=, read -r -a labels <<< "$keydomain"
+  domain=''
+  for label in "${labels[@]}"; do
+    domain="$label${domain:+.$domain}"
+  done
+  printf '%s\n' "$domain$keypath"
+done