summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Luke Shumaker <lukeshu@lukeshu.com> 2017-06-30 20:52:56 -0400
committer: Luke Shumaker <lukeshu@lukeshu.com> 2017-06-30 20:52:56 -0400
commit: 99011e7fcebeccc26a3da591e3445a93ffadad3c (patch)
tree: 563f52145da316f27d243317ad727b376127aff3
parent: e46a74fe8a143936eee2b9be1fd6b5f963357d9d (diff)
more
-rw-r--r-- Makefile 24
-rw-r--r-- bin/cdxcat 0
-rw-r--r-- bin/cdxcut 0
3 files changed, 11 insertions, 13 deletions
diff --git a/Makefile b/Makefile
index eb8ae8e..262d3af 100644
--- a/Makefile
+++ b/Makefile
@@ -1,30 +1,28 @@
+SHELL=bash -o pipefail
PATH:=$(CURDIR)/bin:$(PATH)
export PATH
-all: each-cdx
+all: dat/index.txt
+
+fix:
+ grep -rl '<html><body><h1>503' dat | xargs rm -fv --
dat:
mkdir -p $@
-dat/ftp.txt: | dat
- cdxget 'url=ftp.unicode.org/Public/*' 'fl=urlkey' 'filter=statuscode:200' 'filter=urlkey:.*(cvt|convert)utf.*' > $@
-dat/www.txt: | dat
+dat/cdxindex.txt: | dat
cdxget 'url=www.unicode.org/Public/*' 'fl=urlkey' 'filter=statuscode:200' 'filter=urlkey:.*(cvt|convert)utf.*' > $@
-dat/urlkeys.txt: dat/ftp.txt dat/www.txt
- cat $^ | cut -d '?' -f1 | sed 's/,ftp)/)/' | sort -u > $@
+dat/urlkeys.txt: dat/cdxindex.txt
+ cat $^ | cut -d '?' -f1 | sort -u > $@
dat/urlkeys.mk: dat/urlkeys.txt
cat $^ | sed 's/^/urlkeys+=/' < $< > $@
-include dat/urlkeys.mk
-rp = )
-c = ,
-all_urlkeys = $(urlkeys) $(subst $(rp),$(c)ftp$(rp),$(urlkeys))
dat/each-cdx/%.txt:
- mkdir -p '$(@D)'
+ @mkdir -p '$(@D)'
cdxget "url=$$(urlkey2url '$*')" 'filter=statuscode:200' 'collapse=digest' 'fl=timestamp,original' > '$@'
-
-each-cdx: $(addprefix dat/each-cdx/,$(addsuffix .txt,$(all_urlkeys)))
-.PHONY: each-cdx
+dat/index.txt: $(addprefix dat/each-cdx/,$(addsuffix .txt,$(urlkeys)))
+ cat -- $(foreach c,$^,'$c') | sort > $@
.DELETE_ON_ERROR:
.SECONDARY:
diff --git a/bin/cdxcat b/bin/cdxcat
deleted file mode 100644
index e69de29..0000000
--- a/bin/cdxcat
+++ /dev/null
diff --git a/bin/cdxcut b/bin/cdxcut
deleted file mode 100644
index e69de29..0000000
--- a/bin/cdxcut
+++ /dev/null