正文
Go 语言：用 Redis 作为 URL 队列的爬虫
小程序:扫一扫查出行
【扫一扫了解最新限行尾号】
复制小程序
【扫一扫了解最新限行尾号】
复制小程序
// Copyright 2016 laosj Author @songtianyi. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package main

import (
"github.com/songtianyi/laosj/downloader"
"github.com/songtianyi/laosj/spider"
"github.com/songtianyi/rrframework/connector/redis"
"github.com/songtianyi/rrframework/logs"
"github.com/songtianyi/rrframework/storage"
"regexp"
"strconv"
"sync"
)func main() {
d := &downloader.Downloader{
ConcurrencyLimit: ,
UrlChannelFactor: ,
RedisConnStr: "127.0.0.1:6379",
SourceQueue: "DATA:IMAGE:MZITU:XINGGAN",
Store: rrstorage.CreateLocalDiskStorage("/Users/deer_mac/Desktop/自拍/"),
}
go func() {
d.Start()
}() // step1: find total pages
s, err := spider.CreateSpiderFromUrl("http://www.mzitu.com/share")
if err != nil {
logs.Error(err)
return
}
rs, _ := s.GetText("div.main>div.main-content>div.postlist>div>div.pagenavi-cm>a")
max := spider.FindMaxFromSliceString(, rs) // step2: for every page, find all img tags
var wg sync.WaitGroup
var mu sync.Mutex
step2 := make([]string, )
for i := ; i <= max; i++ {
wg.Add()
go func(ix int) {
defer wg.Done()
ns, err := spider.CreateSpiderFromUrl(s.Url + "/comment-page-" + strconv.Itoa(ix) + "#comments/")
if err != nil {
logs.Error(err)
return
}
t, _ := ns.GetHtml("div.main>div.main-content>div.postlist>div>ul>li>div>p")
mu.Lock()
step2 = append(step2, t...)
mu.Unlock()
}(i)
}
wg.Wait()
err, rc := rrredis.GetRedisClient(d.RedisConnStr)
if err != nil {
logs.Error(err)
return
}
// parse url
for _, v := range step2 {
re := regexp.MustCompile("src=\"(\\S+)\"")
url := re.FindStringSubmatch(v)[]
key := d.SourceQueue
if _, err := rc.RPush(key, url); err != nil {
logs.Error(err)
return
}
}
d.WaitCloser()
}
运行前需要先启动 Redis 服务（程序中使用的地址为 127.0.0.1:6379），然后直接运行该程序即可。