-
43404a01 :Anonymous
2015-12-12 11:31
-
@markdown
どうやらクローラーができたっぽい。それでは早速ポチッとな。
```clojure
(defn get-files-with-recent-command
  "Collects thread file names advertised by the active nodes.

  Calls `recent` with the range 0- on every node in `@active-nodes` (in
  parallel via pmap), splits the combined responses into lines, and keeps
  only lines shaped like
    <stamp><>(32 lowercase hex chars)<>thread_<HEX>[<>...]
  Returns a set of the `thread_<HEX>` names with `known-corrupt-files`
  removed."
  []
  (let [record-pattern #"^[0-9]+<>[0-9a-f]{32}<>(thread_[0-9A-F]+)(<>.*)?$"
        responses (pmap #(recent %1 "0-") @active-nodes)
        ;; Join with a newline rather than plain concatenation: if one
        ;; node's response does not end with a line break, its last record
        ;; would otherwise fuse with the first record of the next response
        ;; and produce a bogus file name (stamp digits are valid hex).
        ;; nil responses are rendered as empty strings by join.
        lines (clojure.string/split (clojure.string/join "\n" responses) #"\n")
        ;; keep drops the lines on which the pattern does not match.
        file-names (keep #(second (re-find record-pattern %)) lines)]
    (clojure.set/difference (into #{} file-names) known-corrupt-files)))
(defn- extract-record-lines
  "Cleans up a response body and returns its non-empty lines.

  Lines matched by `junk-line-pattern` (a multiline regex) are blanked,
  carriage returns are removed, runs of newlines are collapsed, and the
  remaining non-empty lines are returned as a lazy seq."
  [body junk-line-pattern]
  (->> (-> body
           (clojure.string/replace junk-line-pattern "")
           (clojure.string/replace #"\r" "")
           (clojure.string/replace #"\n+" "\n"))
       clojure.string/split-lines
       (remove empty?)))

(defn- parse-head-record
  "Parses one /head line into {:stamp <long> :record-id <string>}.

  Returns nil (and logs at debug level) when the line does not start with
  <digits><>(32 lowercase hex chars) or the stamp does not fit in a long,
  so a single bad line cannot abort the whole download."
  [line]
  (if-let [[_ stamp record-id] (re-find #"^([0-9]+)<>([0-9a-f]{32})" line)]
    (try
      {:stamp (Long/parseLong stamp) :record-id record-id}
      (catch NumberFormatException _
        (timbre/debug (str "download-thread-from-node: Record skipped: " line))
        nil))
    (do
      (timbre/debug (str "download-thread-from-node: Record skipped: " line))
      nil)))

(defn download-thread-from-node
  "Downloads records of the thread file `file-name` from node `node-name`.

  Arities:
    [node-name file-name]        - same as passing the range 0-.
    [node-name file-name range]  - range string sent to the node as-is.

  When `range` is 0- and the database already holds records for the file,
  the node's /head listing is fetched first, compared with the stored
  {:stamp :record-id} pairs, and the function calls itself with the
  range <oldest>-<newest> spanning the missing records. Otherwise the
  /get response for `range` is fetched and every record line is handed to
  `db/add-record` (in parallel via pmap); records whose insertion throws
  are logged at debug level and skipped.

  Returns 0 when /head shows nothing missing, otherwise the number of
  record lines received from /get (including ones skipped on insertion),
  or nil when an Exception occurred during the download.

  Throws IllegalArgumentException when `node-name`, `file-name` or `range`
  fails its `valid-...?` check."
  ([node-name file-name]
   (download-thread-from-node node-name file-name "0-"))
  ([node-name file-name range]
   (timbre/debug "download-thread-from-node:" node-name file-name range)
   (when-not (valid-node-name? node-name)
     (throw (IllegalArgumentException. "Invalid node name.")))
   (when-not (valid-file-name? file-name)
     (throw (IllegalArgumentException. "Invalid file name.")))
   (when-not (valid-range? range)
     (throw (IllegalArgumentException. "Invalid range.")))
   (try
     (let [file-id (db/get-file-id file-name)
           existing-records (and file-id (db/get-all-records-in-file-without-bodies file-id))]
       (if (and (= range "0-") (seq existing-records))
         ;; Use /head to find missing records.
         (let [body (:body (client/get (str "http://" node-name "/head/" file-name "/" range) http-params))
               lines (extract-record-lines body #"(?m)^(?![0-9]+<>[0-9a-f]{32}).*$")
               ;; Unparsable lines are dropped individually instead of
               ;; failing the whole sync with a NumberFormatException.
               remote (set (keep parse-head-record lines))
               ;; Reduce stored rows to the same two keys so that set
               ;; difference compares like with like.
               known (set (map (fn [r] {:stamp (:stamp r) :record-id (:record-id r)})
                               existing-records))
               missing (clojure.set/difference remote known)]
           (if (empty? missing)
             0
             (let [stamps (map :stamp missing)
                   oldest (apply min stamps)
                   newest (apply max stamps)]
               (download-thread-from-node node-name file-name (str oldest "-" newest)))))
         ;; Use the supplied range.
         (let [body (:body (client/get (str "http://" node-name "/get/" file-name "/" range) http-params))
               records (extract-record-lines body #"(?m)^(?![0-9]+<>[0-9a-f]{32}<>).*$")]
           (dorun
             (pmap
               (fn [line]
                 (try
                   (let [[_ stamp record-id record-body]
                         (re-find #"^([0-9]+)<>([0-9a-f]{32})<>(.*)$" line)]
                     (db/add-record file-id stamp record-id record-body))
                   ;; Best effort: a record that cannot be stored is logged
                   ;; and skipped so the remaining records are still added.
                   (catch Throwable _
                     (timbre/debug (str "download-thread-from-node: Record skipped: " line)))))
               records))
           ;; NOTE: whole-file validation is currently disabled:
           ;(if-not (valid-file? file)
           ;  (throw (Exception. "Invalid file.")))
           (count records))))
     (catch Exception e
       (timbre/error e)
       nil))))
(defn download-thread-from-all-active-nodes
  "Runs `download-thread-from-node` for `file-name` against every node in
  `@active-nodes`, one after another in shuffled order.

  Arities:
    [file-name]        - same as passing the range 0-.
    [file-name range]  - range string forwarded to each node download.

  Returns true once all nodes have been visited.
  Throws IllegalArgumentException when `file-name` or `range` fails its
  `valid-...?` check."
  ([file-name]
   (download-thread-from-all-active-nodes file-name "0-"))
  ([file-name range]
   (when-not (valid-file-name? file-name)
     (throw (IllegalArgumentException. "Invalid file name.")))
   (when-not (valid-range? range)
     (throw (IllegalArgumentException. "Invalid range.")))
   ;; The node order is randomized on every call.
   (doseq [node (shuffle @active-nodes)]
     (download-thread-from-node node file-name range))
   true))
(defn crawl-nodes
  "Performs one crawl pass.

  First registers every file name returned by
  `get-files-with-recent-command` through `db/add-file`, then calls
  `download-thread-from-all-active-nodes` for every file returned by
  `db/get-all-files`. Both steps run in parallel via pmap.

  Always returns nil; any Throwable raised along the way is logged with
  `timbre/error` and swallowed."
  []
  (timbre/debug "crawl-nodes")
  (try
    ;; Step 1: register the files advertised by the nodes.
    (->> (get-files-with-recent-command)
         (pmap (fn [file-name] (db/add-file file-name)))
         dorun)
    ;; Step 2: download every file known to the database.
    (->> (db/get-all-files)
         (map :file-name)
         (pmap download-thread-from-all-active-nodes)
         dorun)
    (catch Throwable t
      (timbre/error t)
      nil)))
```
Powered by shinGETsu.