From e46a74fe8a143936eee2b9be1fd6b5f963357d9d Mon Sep 17 00:00:00 2001 From: Luke Shumaker Date: Fri, 30 Jun 2017 20:05:23 -0400 Subject: work on it --- .gitignore | 5 +---- Makefile | 29 +++++++++++++++++------------ bin/cdxcat | 0 bin/cdxcut | 0 bin/cdxget | 7 +++++++ bin/urlkey2url | 8 ++++++++ cdx_json | 7 ------- urlkey2url | 8 -------- 8 files changed, 33 insertions(+), 31 deletions(-) create mode 100644 bin/cdxcat create mode 100644 bin/cdxcut create mode 100755 bin/cdxget create mode 100755 bin/urlkey2url delete mode 100755 cdx_json delete mode 100755 urlkey2url diff --git a/.gitignore b/.gitignore index 745d6d4..a15fceb 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1 @@ -*.json -*/ -*.mk -*.txt +/dat/ diff --git a/Makefile b/Makefile index 14db651..eb8ae8e 100644 --- a/Makefile +++ b/Makefile @@ -1,24 +1,29 @@ +PATH:=$(CURDIR)/bin:$(PATH) +export PATH + all: each-cdx -ftp.json: cdx_json - ./cdx_json 'url=ftp.unicode.org/Public/*' 'collapse=urlkey' > $@ -www.json: cdx_json - ./cdx_json 'url=www.unicode.org/Public/*' 'collapse=urlkey' > $@ -urlkeys.txt: ftp.json www.json Makefile - cat $^ | cut -d '"' -f2,10 | sed -n 's/"200$$//p' | cut -d '?' -f1 | sed 's/,ftp)/)/' | sort -u | grep -i -e cvtutf -e convertutf > $@ -urlkeys.mk: urlkeys.txt - sed 's/^/urlkeys+=/' < $< > $@ +dat: + mkdir -p $@ +dat/ftp.txt: | dat + cdxget 'url=ftp.unicode.org/Public/*' 'fl=urlkey' 'filter=statuscode:200' 'filter=urlkey:.*(cvt|convert)utf.*' > $@ +dat/www.txt: | dat + cdxget 'url=www.unicode.org/Public/*' 'fl=urlkey' 'filter=statuscode:200' 'filter=urlkey:.*(cvt|convert)utf.*' > $@ +dat/urlkeys.txt: dat/ftp.txt dat/www.txt + cat $^ | cut -d '?' -f1 | sed 's/,ftp)/)/' | sort -u > $@ +dat/urlkeys.mk: dat/urlkeys.txt + cat $^ | sed 's/^/urlkeys+=/' < $< > $@ --include urlkeys.mk +-include dat/urlkeys.mk rp = ) c = , all_urlkeys = $(urlkeys) $(subst $(rp),$(c)ftp$(rp),$(urlkeys)) -each-cdx/%.json: cdx_json urlkey2url +dat/each-cdx/%.txt: mkdir -p '$(@D)' - ./cdx_json "url=$$(./urlkey2url '$*')" 'collapse=digest' > '$@' + cdxget "url=$$(urlkey2url '$*')" 'filter=statuscode:200' 'collapse=digest' 'fl=timestamp,original' > '$@' -each-cdx: $(addprefix each-cdx/,$(addsuffix .json,$(all_urlkeys))) +each-cdx: $(addprefix dat/each-cdx/,$(addsuffix .txt,$(all_urlkeys))) .PHONY: each-cdx .DELETE_ON_ERROR: diff --git a/bin/cdxcat b/bin/cdxcat new file mode 100644 index 0000000..e69de29 diff --git a/bin/cdxcut b/bin/cdxcut new file mode 100644 index 0000000..e69de29 diff --git a/bin/cdxget b/bin/cdxget new file mode 100755 index 0000000..a54612d --- /dev/null +++ b/bin/cdxget @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +url='http://web.archive.org/cdx/search/cdx?' +for arg in "$@"; do + url+="$s${arg%%=*}=$(printf '%s' "${arg#*=}"|urlencode)&" +done +curl -s "$url" diff --git a/bin/urlkey2url b/bin/urlkey2url new file mode 100755 index 0000000..5d0ec3d --- /dev/null +++ b/bin/urlkey2url @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +for arg in "$@"; do + keydomain="${arg%%)*}" + keypath="${arg#*)}" + domain="$(IFS=,; printf '%s\n' $keydomain|tac|xargs|tr ' ' '.')" + echo "$domain$keypath" +done diff --git a/cdx_json b/cdx_json deleted file mode 100755 index 81284af..0000000 --- a/cdx_json +++ /dev/null @@ -1,7 +0,0 @@ -#!/usr/bin/env bash - -url='http://web.archive.org/cdx/search/cdx?output=json' -for arg in "$@"; do - url+="&${arg%%=*}=$(printf '%s' "${arg#*=}"|urlencode)" -done -curl -s "$url" diff --git a/urlkey2url b/urlkey2url deleted file mode 100755 index 5d0ec3d..0000000 --- a/urlkey2url +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env bash - -for arg in "$@"; do - keydomain="${arg%%)*}" - keypath="${arg#*)}" - domain="$(IFS=,; printf '%s\n' $keydomain|tac|xargs|tr ' ' '.')" - echo "$domain$keypath" -done -- cgit v1.1-4-g5e80