新月実装開発部

afd30e3f anonymous 2015-12-12 15:29
@markdown
クローラがようやくちゃんと動くようになりました。
/getをうまく使わないとすぐタイムアウトになっちゃいますね、これ。
/headでレスの欠損を調べて、一括でダウンロードできなかったスレは
レコードを個別にダウンロードしています。
```clojure
(defn get-files-with-recent-command
  "Queries every node in `active-nodes` with `(recent node \"0-\")` in parallel
  and returns the set of thread file names found in the responses, with the
  names in `known-corrupt-files` removed.

  A response line is accepted only when it has the form
  `<digits><><32 lowercase hex chars><>thread_<uppercase hex>[<>...]`;
  every other line is ignored."
  []
  (let [line-pattern #"^[0-9]+<>[0-9a-f]{32}<>(thread_[0-9A-F]+)(<>.*)?$"
        ;; Join the per-node responses with a newline instead of gluing them
        ;; together directly: if a response does not end with a newline, its
        ;; last line would otherwise fuse with the first line of the next
        ;; response and could still match the pattern with a wrong file name.
        ;; Any blank lines this introduces are rejected by the pattern.
        ;; NOTE(review): assumes `recent` returns the response text (or nil).
        responses (clojure.string/join "\n" (pmap #(recent %1 "0-") @active-nodes))
        ;; `keep` drops the lines for which the pattern does not match.
        file-names (keep #(second (re-find line-pattern %))
                         (clojure.string/split responses #"\n"))]
    (clojure.set/difference (into #{} file-names) known-corrupt-files)))

(defn download-thread-from-node
  "Downloads records of the thread `file-name` from `node-name` and stores
  them with `db/add-record`.

  `range` defaults to \"0-\". When `range` is \"0-\" and the database already
  holds at least one record for the file, the node is first asked for its
  record list via /head; only the records that are not stored yet are then
  fetched: once in bulk with the range `oldest-newest` of the missing stamps,
  and afterwards one by one for whatever is still missing. In every other
  case the given `range` is fetched directly via /get.

  Returns
    - 0 when /head shows that nothing is missing,
    - nil after a gap-filling pass,
    - the number of record lines received on the direct /get path,
    - nil when an exception was caught (it is logged with `timbre/error`).

  Throws IllegalArgumentException when `node-name`, `file-name` or `range`
  fails its validation; these checks run outside the `try`, so the
  exception reaches the caller."
  ([node-name file-name]
   (download-thread-from-node node-name file-name "0-"))

  ([node-name file-name range]
   (timbre/debug "download-thread-from-node:" node-name file-name range)
   (when-not (valid-node-name? node-name)
     (throw (IllegalArgumentException. "Invalid node name.")))
   (when-not (valid-file-name? file-name)
     (throw (IllegalArgumentException. "Invalid file name.")))
   (when-not (valid-range? range)
     (throw (IllegalArgumentException. "Invalid range.")))

   (try
     (let [file-id (db/get-file-id file-name)
           existing-records (and file-id (db/get-all-records-in-file-without-bodies file-id))
           ;; Reduce rows to the two fields that identify a record so that
           ;; stored rows and /head lines can be compared as sets.
           record-keys (fn [rows]
                         (into #{} (map (fn [row] {:stamp (:stamp row) :record-id (:record-id row)})
                                        rows)))]
       (if (and (= range "0-")
                existing-records
                (pos? (count existing-records)))
         ; Use /head to find missing records.
         (let [file (:body (client/get (str "http://" node-name "/head/" file-name "/" range) http-params))
               ;; Blank out every line that does not start with "<stamp><><record-id>".
               file (clojure.string/replace file #"(?m)^(?![0-9]+<>[0-9a-f]{32}).*$" "")
               file (clojure.string/replace file #"\r" "")
               file (clojure.string/replace file #"\n+" "\n")
               lines (remove empty? (clojure.string/split-lines file))
               remote (into #{} (map (fn [line]
                                       (let [match (re-find #"^([0-9]+)<>([0-9a-f]{32})" line)]
                                         {:stamp (Integer/parseInt (nth match 1)) :record-id (nth match 2)}))
                                     lines))
               missing (clojure.set/difference remote (record-keys existing-records))]
           (if (empty? missing)
             0
             (let [stamps (map :stamp missing)
                   oldest (apply min stamps)
                   newest (apply max stamps)]
               ;; First try to fetch everything that is missing in one request.
               (download-thread-from-node node-name file-name (str oldest "-" newest))
               ;; Re-read the stored records: the bulk request above may have
               ;; filled some or all of the gaps. Comparing against the
               ;; snapshot taken before the request would make every missing
               ;; record be downloaded a second time.
               (let [stored (record-keys (db/get-all-records-in-file-without-bodies file-id))
                     still-missing (clojure.set/difference missing stored)]
                 (doseq [stamp (map :stamp still-missing)]
                   (download-thread-from-node node-name file-name (str stamp)))))))

         ; Use the supplied range.
         (let [file (:body (client/get (str "http://" node-name "/get/" file-name "/" range) http-params))
               ;; Blank out every line that does not start with "<stamp><><record-id><>".
               file (clojure.string/replace file #"(?m)^(?![0-9]+<>[0-9a-f]{32}<>).*$" "")
               file (clojure.string/replace file #"\r" "")
               file (clojure.string/replace file #"\n+" "\n")
               records (remove empty? (clojure.string/split-lines file))]
           (dorun
             (pmap
               (fn [line]
                 ;; A record that cannot be parsed or stored is skipped so
                 ;; that it does not abort the rest of the batch.
                 (try
                   (let [match (re-find #"^([0-9]+)<>([0-9a-f]{32})<>(.*)$" line)
                         stamp (nth match 1)
                         record-id (nth match 2)
                         body (nth match 3)]
                     (db/add-record file-id stamp record-id body))
                   (catch Throwable _
                     (timbre/debug (str "download-thread-from-node: Record skipped: " line)))))
               records))
           ;(if-not (valid-file? file)
           ;  (throw (Exception. "Invalid file.")))
           (count records))))
     (catch Exception e
       (timbre/error e)
       nil))))

(defn download-thread-from-all-active-nodes
  "Calls `download-thread-from-node` for `file-name` on every node in
  `active-nodes`, one after another in random order. `range` defaults to
  \"0-\".

  Returns true once all nodes have been visited.
  Throws IllegalArgumentException when `file-name` or `range` fails its
  validation."
  ([file-name]
   (download-thread-from-all-active-nodes file-name "0-"))

  ([file-name range]
   (when-not (valid-file-name? file-name)
     (throw (IllegalArgumentException. "Invalid file name.")))
   (when-not (valid-range? range)
     (throw (IllegalArgumentException. "Invalid range.")))
   (doseq [node-name (shuffle @active-nodes)]
     (download-thread-from-node node-name file-name range))
   true))

(defn crawl-node
  "Walks over all files returned by `db/get-all-files` in random order and
  downloads each one from `node-name`.

  Before every download the node is looked up in `active-nodes` again; files
  are skipped while the node is not listed there. Any Throwable is logged
  with `timbre/error`. Always returns nil."
  [node-name]
  (timbre/debug "crawl-node:" node-name)
  (try
    (let [file-names (shuffle (map :file-name (db/get-all-files)))]
      (doseq [file-name file-names]
        (when (some #{node-name} @active-nodes)
          (download-thread-from-node node-name file-name))))
    (catch Throwable t
      (timbre/error t)
      nil)))

(defn crawl-nodes
  "Runs `crawl-node` for every node in `active-nodes`, in random order and in
  parallel via `pmap`. Any Throwable is logged with `timbre/error`.
  Always returns nil."
  []
  (timbre/debug "crawl-nodes")
  (try
    ;; Disabled: registering the files reported by the recent command.
    (comment let [file-names (get-files-with-recent-command)]
      (dorun (pmap #(db/add-file %) file-names)))
    (->> @active-nodes
         shuffle
         (pmap crawl-node)
         dorun)
    (catch Throwable t
      (timbre/error t)
      nil)))
```

Top of this page. | <<last 0 1 2

limit: 15360KB

(新月実装開発部/255/0.8MB)


Powered by shinGETsu.