作者:找唐娃娃_622 | 来源:互联网 | 2023-10-12 17:57
仅供参考和学习,请适度轻爬。爬取第一PPT导航栏中除了后三个之外的所有ppt,并按分类保存到对应文件夹(如下图)。提供 windows64 二进制程序和 mac 二进制程序;如想自己尝试,步骤如下。支持…
仅供参考和学习,请适度轻爬
爬取第一PPT导航栏中的除了后三个的所有ppt并分类文件夹保存如下图
windows64二进制程序
mac二进制程序
git clone https://github.com/xhaoxiong/fetch_ppt
go build
./FetchPPT(Windows 下运行 FetchPPT.exe)
/*** @Author xiaoxiao* @Description CREATE FILE collector* @Date 2020/10/10 10:29 上午**/
package collectorimport ("FetchPPT/util""bytes""fmt""github.com/gocolly/colly/v2""io""log""os""path""strconv""strings""sync"
)const (originUrl = "http://www.di1ppt.com"downloadOriginUrl = "http://www.di1ppt.com/e/DownSys"
)type SeedConfig struct {DownloadFile DownloadFileGetNavCollector func(file DownloadFile)
}type DownloadFile struct {Filename stringUrl stringDirectory stringOriginDirectory string
}type CrawCollector struct {DownloadFile DownloadFileWg *sync.WaitGroup
}var filterMap = map[string]bool{"/office/": true,"/ziti/": true,"http://www.10103.com": true,
}var page_list = make(map[string]bool)func Run() {SeedConfig := SeedConfig{DownloadFile: DownloadFile{Filename: "",Url: "",Directory: "",},GetNavCollector: GetNavCollector,}SeedConfig.Run()
}func (s *SeedConfig) Run() {s.GetNavCollector(s.DownloadFile)
}var Wg = &sync.WaitGroup{}//获取导航页面
func GetNavCollector(downloadFile DownloadFile) {c := colly.NewCollector()c.OnHTML("#navMenu li", func(element *colly.HTMLElement) {cc := &CrawCollector{DownloadFile: downloadFile,}seedUrl := element.ChildAttr("a", "href")if !filterMap[seedUrl] {directory := element.ChildText("a>span")dir := path.Join(downloadFile.OriginDirectory, directory)if !util.Exists(dir) {os.MkdirAll(dir, 0777)}cc.DownloadFile.Directory = dirWg.Add(1)cc.GetDetailCollector(seedUrl)}})c.OnScraped(func(response *colly.Response) {fmt.Println("完成全部抓取")})c.Visit(originUrl)Wg.Wait()
}//获取导航对应首页N页列表
// GetDetailCollector registers a handler on ".dlbox .clearfix .pages" that
// reads the href of the anchor in the last <li>, takes the text between the
// first "_" and the following "." and parses it as the integer ii.
//
// NOTE(review): the body is truncated in this copy — the text stops in the
// middle of the for-loop header ("for i := 1; i"), and the loop body, the end
// of the OnHTML callback and any Visit call are missing. It does not compile
// as shown; restore it from the original repository rather than guessing.
// NOTE(review): both the Attr "found" flag and the Atoi error are discarded,
// and split[1] panics when the href contains no "_".
func (cc *CrawCollector) GetDetailCollector(seedUrl string) {c := colly.NewCollector()c.OnHTML(".dlbox .clearfix .pages", func(element *colly.HTMLElement) {lis := element.DOM.Find("li")pageUrl, _ := lis.Last().Find("a").Attr("href")split := strings.Split(pageUrl, "_")ii, _ := strconv.Atoi(strings.Split(split[1], ".")[0])for i := 1; i }// Get the detail pages that correspond to each page.
func (cc *CrawCollector) GetPageDetailCollector(seedUrl2 string) {c := colly.NewCollector()c.OnHTML(".dlbox .tplist li>a", func(element *colly.HTMLElement) {detailUrl := element.Attr("href")Wg.Add(1)go cc.GetDownloadUrlCollector(detailUrl)})c.Visit(originUrl + seedUrl2)
}//获取下载页面
func (cc *CrawCollector) GetDownloadUrlCollector(detailUrl string) {c := colly.NewCollector()c.OnHTML(".downurllist li>a", func(element *colly.HTMLElement) {if element.Index == 0 {downloadlUrl := element.Attr("href")cc.GetDownloadUrlDetailCollector(downloadlUrl)}})c.Visit(originUrl + detailUrl)
}//获取验证码下载页面
func (cc *CrawCollector) GetDownloadUrlDetailCollector(downLoadDetailUrl string) {c := colly.NewCollector()c.OnHTML("tbody td>a", func(element *colly.HTMLElement) {downloadUrl := element.Attr("href")downloadUrl = strings.Replace(downloadUrl, "..", "", -1)cc.DownloadFile.Url = downloadOriginUrl + downloadUrlWg.Add(1)go cc.FetchPPT(downloadOriginUrl + downloadUrl)})c.Visit(originUrl + downLoadDetailUrl)
}//获取ppt详情下载页面
func (cc *CrawCollector) FetchPPT(dowloadUrl string) {c := colly.NewCollector()defer Wg.Done()c.OnResponse(func(response *colly.Response) {filename := response.FileName()filepath := path.Join(cc.DownloadFile.Directory, filename)if _, err := os.Stat(filepath); err == nil {log.Println("文件已存在:", filename)return}output, err := os.Create(filepath)defer output.Close()if err != nil {log.Println("创建失败: ", err)}_, err = io.Copy(output, bytes.NewReader(response.Body))if err != nil {log.Println("写入失败 ", err)}log.Printf("下载文件 %s/%s", cc.DownloadFile.Directory, filename)})c.Visit(dowloadUrl)
}