-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgetbook.rb
executable file
·149 lines (133 loc) · 4.07 KB
/
getbook.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/env ruby
# coding: utf-8
# Get a book from http://oreno-erohon.com (the other supported sites are listed in get_from)
require 'yaml'
require 'ostruct'
require 'optparse'
require 'nokogiri'
require 'cgi'
load File.expand_path('../wget.rb', __FILE__)
# Global option store: the defaults and command-line switches at the bottom of
# this file are written into it, and the download helpers read from it.
$opts = OpenStruct.new
# Fetch one book into the current working directory.
#
# The book page is saved as main-page.html (an existing copy is reused),
# parsed, and its metadata written to info.yaml. Unless +opts.up_info+ is set,
# every image that is not on disk yet is then downloaded and done.txt is
# written as a completion flag.
#
# @param url [String] address of the book page
# @param opts [OpenStruct] reads +fast+, +retry+, +xpath+, +tags_xpath+,
#   +imgpath+ (callable: node -> image URL), +basename+ (callable: image URL
#   -> file name) and +up_info+
# @return [Object] not meaningful; nil when only the info file was updated
def get_book(url, opts)
  # NOTE(review): MyWGet is defined in wget.rb; the first argument is
  # presumably a delay between requests (nil in fast mode) -- confirm there.
  wget = MyWGet.new(opts.fast ? nil : 1, opts.retry)
  cache = 'main-page.html'
  wget.get(url, cache) unless File.exist?(cache)

  # Block form closes the handle as soon as the page has been parsed.
  doc = File.open(cache, 'r:utf-8') { |page| Nokogiri::HTML(page) }

  info = OpenStruct.new
  info.url = url
  info.title = doc.title
  info.tags = opts.tags_xpath ? doc.xpath(opts.tags_xpath).map(&:text) : []
  info.img_files = doc.xpath(opts.xpath).map { |node| opts.imgpath.call(node) }
  puts 'get tags failed' if info.tags.empty?

  # Write and close immediately so info.yaml is complete on disk before any
  # download starts (and before a later duplicate check reads it).
  File.open('info.yaml', 'w:utf-8') { |out| out.write(YAML.dump(info.to_h)) }
  return if opts.up_info

  img_num = info.img_files.size
  info.img_files.each_with_index do |img_url, idx|
    fname = format('%03d_%s', idx, opts.basename.call(img_url))
    if File.exist?(fname)
      puts "skip #{fname}"
    else
      puts format('fetching %d/%d %s', idx + 1, img_num, img_url)
      wget.get(img_url, fname)
    end
  end
  File.write('done.txt', 'done flag')
end
# Choose the scraping rules for the site +url+ belongs to, then download the
# book with get_book.
#
# XPaths already present in +opts+ (e.g. from the command line) take
# precedence over the per-site defaults.
#
# @param url [String] address of the book page
# @param opts [OpenStruct] option set; it is copied, never modified
# @return [Object] whatever get_book returns
# @raise [RuntimeError] when +url+ matches none of the supported sites
def get_from(url, opts)
  # Work on a copy: storing site defaults in the shared struct would make the
  # first site's XPaths stick for every later URL of a --list run.
  opts = opts.dup
  opts.imgpath = proc { |e| e['src'] }
  opts.basename = proc { |u| File.basename(u) }

  case url
  when /oreno-erohon\.com/
    opts.xpath ||= '//div[@id="main"]/article/div/section/img'
    opts.tags_xpath ||= '//div[@class="article-tags"][1]/ul/li/a'
  when /eromanga-collector\.com/
    opts.xpath ||= '//div[@id="main"]/article/div/section/img'
    opts.tags_xpath ||= '//table[@class="article-all-taxs"][1]/tr[4]/td/ul/li/a'
  when /xn--qexm24f3mc\.xyz/
    opts.xpath ||= '//div[@id="contentimg"]/ul/li/img'
    # This site percent-encodes file names and keeps the image address in
    # data-original instead of src.
    opts.basename = proc { |u| File.basename(CGI.unescape(u)) }
    opts.imgpath = proc { |e| e['data-original'] }
  else
    raise "unsupported site: #{url}"
  end

  get_book(url, opts)
end
# Create a fresh directory for a book, starting at +dir+ and stepping to the
# next name (String#succ) until one is found that does not exist yet.
#
# @param dir [String] first candidate directory name; left unmodified
# @param url [String] book address, used only in the log line
# @return [String] name of the directory that was created
def create_dir(dir, url)
  # Copy before incrementing: succ! on the caller's string would fail for a
  # frozen string and silently change the caller's value otherwise.
  target = dir.dup
  target.succ! while Dir.exist?(target)
  puts "mkdir #{target} for #{url}"
  Dir.mkdir(target)
  target
end
# Look through books/book-*/info.yaml (relative to the current directory) for
# a book that was fetched from +url+.
#
# @param url [String] book address to look for
# @return [String, false] directory of the matching book, or false if none
def check_duplicate(url)
  found = Dir.glob('books/book-*/info.yaml').find do |yaml|
    # info.yaml is written from a Hash with Symbol keys, which the default
    # safe loading of current Psych rejects unless Symbol is permitted.
    info = YAML.safe_load(File.read(yaml, encoding: 'utf-8'),
                          permitted_classes: [Symbol], aliases: true)
    # An empty or half-written file does not load as a Hash; skip it.
    info.is_a?(Hash) && url == info[:url]
  end
  found ? File.dirname(found) : false
end
# Decide, from the global $opts and the books already on disk, whether +url+
# has to be downloaded, and if so run get_from inside the book's directory.
#
# Reads $opts.continue, $opts.no_download, $opts.list and $opts.dir. When
# $opts.dir is nil (--curdir) a new book is fetched into the current
# directory instead of a freshly created one.
#
# @param url [String] address of the book page
# @return [Object] not meaningful; nil when the book was skipped
def check_download(url)
  existing = check_duplicate(url)
  if existing && !$opts.continue
    puts "already got it in #{existing}"
    return
  end
  if $opts.no_download && !existing
    puts "missing book: #{url}"
    return
  end
  if $opts.continue
    if !existing && !$opts.list
      puts 'continue on book which not exist.'
      return
    end
    if existing && File.exist?(File.join(existing, 'done.txt'))
      puts "skip book #{existing} which is done"
      return
    end
    puts "continue download #{url} in #{existing}" if existing
  end

  # create_dir cannot handle a nil name, so only call it when one is set.
  dir = existing || ($opts.dir && create_dir($opts.dir, url))
  if dir
    # Block form restores the previous directory even if the download raises.
    Dir.chdir(dir) { get_from(url, $opts) }
  else
    get_from(url, $opts)
  end
end
# main
# Defaults; the command-line switches below may override them.
$opts.dir = 'books/book-t-001'
$opts.retry = 3
OptionParser.new do |op|
  op.banner = 'getbook.rb [options] url'
  op.on('--url URL', 'get from url') { |u| $opts.url = u }
  op.on('--update-info', 'only update info file') { $opts.up_info = true }
  op.on('--dir DIR', 'get book in DIR') { |d| $opts.dir = d }
  op.on('--curdir', 'get book in current dir') { $opts.dir = nil }
  op.on('-C', '--continue', 'continue on exist book') { $opts.continue = true }
  op.on('-F', '--fast', 'fast mode') { $opts.fast = true }
  op.on('-R', '--retry N', 'retry times') { |n| $opts.retry = n.to_i }
  op.on('-l', '--list LIST', 'get from list') { |l| $opts.list = l }
  op.on('-x', '--xpath xpath', 'img from xpath') { |x| $opts.xpath = x }
  op.on('-t', '--tags-xpath xt', 'tags xpath') { |x| $opts.tags_xpath = x }
  op.on('-n', '--no-download', 'only check') { $opts.no_download = true }
end.parse!

# --update-info without --url: take the address from the info.yaml in the
# current directory.
if $opts.up_info && !$opts.url
  puts 'read info.yaml'
  # info.yaml has Symbol keys, which current Psych only loads when Symbol is
  # explicitly permitted.
  info = YAML.safe_load(File.read('info.yaml', encoding: 'utf-8'),
                        permitted_classes: [Symbol], aliases: true)
  $opts.url = info[:url] if info.is_a?(Hash)
end

if $opts.list
  # read from list: the first http... token on each line is taken as a URL
  File.readlines($opts.list).each do |line|
    link = line[/http\S+/]
    check_download(link) if link
  end
else
  fail "need url" if !$opts.url && ARGV.empty?
  $opts.url ||= ARGV[0]
  check_download($opts.url)
end