相关文章:

为提高英语,花两个小时写了个爬取voa上的英文新闻

为提高英语,花两个小时写了个爬取voa上的英文新闻,主要用了goquery这个库,没其他的了,关键是能够解决我自身的需求。如果您很关注性能,或者非常纠结于代码的美观,那么这段代码可能会被喷,anyway,我认为能够用最快的方式解决我问题的就ok了,毕竟我写这个的目的是解决实际的问题。

项目部署在我的树莓派上,使用Notr进行内网穿透,可以通过voa 或者voa 进行访问。

package main

import (
	"fmt"
	"io/ioutil"
	"log"
	"os"
	"strconv"
	"strings"
	"time"

	"github.com/gin-gonic/gin"

	"github.com/PuerkitoBio/goquery"
)

// Crawler and server defaults.
var (
	// defaultSite is prepended to the href of every link found on the list
	// page to build the URL of the article's detail page.
	defaultSite  = "http://www.51voa.com"
	// defaultEntry is the list page the crawler starts from.
	defaultEntry = "http://www.51voa.com/Technology_Report_1.html"
	// defaultDir is the directory the rendered articles are written to and
	// listed from.
	defaultDir   = "./news"
)

// VOA holds the pieces of one article that are rendered into a stored page.
type VOA struct {
	// Title is the inner HTML of the article's link on the list page.
	Title    string `json:"title"`
	// VoiceUrl is the href of the "#mp3" element on the detail page.
	VoiceUrl string `json:"voice_url"`
	// Content is the inner HTML of the "#content" element on the detail page.
	Content  string `json:"content"`
}

// main launches the crawler in the background and serves the downloaded
// articles over HTTP on port 8000. "GET /" renders an HTML index with one
// link per *.html file found in defaultDir; "/news" serves the stored pages
// themselves. main exits with a fatal log entry if the server cannot run.
func main() {
	go spider(defaultEntry)

	server := gin.Default()
	server.GET("/", func(ctx *gin.Context) {
		// List defaultDir itself. Going through os.Stat(...).Name() yields
		// only the base name ("news"), which resolves to the right place
		// only by coincidence.
		files, err := ioutil.ReadDir(defaultDir)
		if err != nil && !os.IsNotExist(err) {
			log.Printf("read dir fail: %v\n", err)
			// 500 is http.StatusInternalServerError; the literal keeps the
			// file's import list unchanged.
			ctx.String(500, "read dir fail")
			return
		}
		// A missing directory only means nothing has been stored yet; files
		// is nil in that case and an empty list is rendered.

		// NOTE(review): file names originate from links on the crawled site
		// and are written into the page unescaped.
		var list strings.Builder
		for _, file := range files {
			if file.IsDir() || !strings.HasSuffix(file.Name(), ".html") {
				continue
			}

			// "some-article-title-12345.html" => "some article title"
			name := strings.TrimSuffix(file.Name(), ".html")
			words := strings.Split(name, "-")
			if len(words) > 1 {
				// Drop the trailing numeric id, but never the only word,
				// so the link text cannot end up empty.
				if _, err := strconv.Atoi(words[len(words)-1]); err == nil {
					words = words[:len(words)-1]
				}
			}
			title := strings.Join(words, " ")

			fmt.Fprintf(&list, "<li><a href=\"%s/%s\">%s</a></li>", defaultDir, file.Name(), title)
		}

		page := fmt.Sprintf(index, list.String())
		// 200 is http.StatusOK.
		ctx.Data(200, "text/html; charset=utf-8", []byte(page))
	})

	server.Static("/news", defaultDir)

	if err := server.Run(":8000"); err != nil {
		log.Fatalf("run server fail: %v\n", err)
	}
}

// spider crawls the article list at entry once every 24 hours, forever.
//
// For every link matched by "#list li a" it fetches the detail page, takes
// the audio URL from the href of "#mp3" and the article body from "#content",
// renders both through template and stores the page in defaultDir. The file
// is named after the part of the link that follows "VOA_Special_English".
//
// If defaultDir cannot be created or the list page cannot be loaded, the
// round is retried after 10 seconds. An article that fails is reported and
// skipped. spider never returns.
func spider(entry string) {
	for {
		// Writing a file does not create its parent directory, so make sure
		// the target exists before anything is stored.
		if err := os.MkdirAll(defaultDir, 0755); err != nil {
			fmt.Println("create news dir fail: ", err)
			time.Sleep(time.Second * 10)
			continue
		}

		doc, err := goquery.NewDocument(entry)
		if err != nil {
			fmt.Println("open document fail: ", err)
			time.Sleep(time.Second * 10)
			// Retry instead of returning so the crawler survives outages.
			continue
		}

		doc.Find("#list").Find("li").Find("a").Each(func(idx int, element *goquery.Selection) {
			saveArticle(element)
		})

		time.Sleep(time.Hour * 24)
	}
}

// saveArticle downloads the article that link points to and writes the
// rendered page into defaultDir. Every failure is printed and ends the
// attempt for this article only.
func saveArticle(link *goquery.Selection) {
	href, exist := link.Attr("href")
	if !exist {
		fmt.Println("element not contains href attribute")
		return
	}

	// Settle the file name before any further network traffic: a link that
	// cannot be stored is not worth fetching.
	name, ok := articleFileName(href)
	if !ok {
		fmt.Println("unexpected article link: ", href)
		return
	}

	title, err := link.Html()
	if err != nil {
		fmt.Println("element not conains html content: ", err)
		return
	}

	detail, err := goquery.NewDocument(defaultSite + href)
	if err != nil {
		fmt.Println("got detail page fail: ", err)
		return
	}

	mp3url, exist := detail.Find("#mp3").Attr("href")
	if !exist {
		fmt.Println("not contains mp3 download url")
		return
	}

	content, err := detail.Find("#content").Html()
	if err != nil {
		fmt.Println("not contains voa news content: ", err)
		return
	}

	voa := &VOA{
		Title:    title,
		VoiceUrl: mp3url,
		Content:  content,
	}

	body := fmt.Sprintf(template, voa.Title, voa.Title, voa.VoiceUrl, voa.Content)

	// 0666 (before umask) is the mode os.Create would have used.
	if err := ioutil.WriteFile(defaultDir+"/"+name, []byte(body), 0666); err != nil {
		fmt.Println("create file fail: ", err)
	}
}

// articleFileName derives the local file name for an article link: the last
// path segment of whatever follows the final "VOA_Special_English" in href.
// It reports false when the marker is missing or no usable name remains, so
// the caller never slices href at a bogus offset and never writes outside
// defaultDir.
func articleFileName(href string) (string, bool) {
	const prefix = "VOA_Special_English"

	pos := strings.LastIndex(href, prefix)
	if pos < 0 {
		return "", false
	}

	name := href[pos+len(prefix):]
	// Keep only the last segment; this also removes the leading "/" that
	// separates the marker from the file name.
	if sep := strings.LastIndexAny(name, "/\\"); sep >= 0 {
		name = name[sep+1:]
	}
	if name == "" || name == "." || name == ".." {
		return "", false
	}
	return name, true
}

// template is the fmt layout of a stored article page. It takes four string
// arguments, in this order: the page title, the heading, the audio URL and
// the article body as HTML.
var template = `
	<html>
		<head>
			<meta charset="utf-8">
			<title>%s</title>
			<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/css/bootstrap.min.css" integrity="sha384-WskhaSGFgHYWDcbwN70/dfYBj47jz9qbsMId/iRN3ewGhXQFZCSftd1LZCfmhktB" crossorigin="anonymous">
			<meta name="referrer" content="never">
		</head>
		<body>
			<div class="container" style="padding:50px">
				<h2>%s</h2>
				<p>
					<audio src="%s" controls="controls">
					Your browser does not support the audio element.
					</audio>
				</p>
				<p>
					%s
				</p>
			</div>
		</body>
	</html>
`

// index is the fmt layout of the article list page. It takes one string
// argument: the already rendered <li> items that go inside the list.
var index = `
	<html>
	<head>
		<meta charset="utf-8">
		<title>voa article</title>
		<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.1/css/bootstrap.min.css" integrity="sha384-WskhaSGFgHYWDcbwN70/dfYBj47jz9qbsMId/iRN3ewGhXQFZCSftd1LZCfmhktB" crossorigin="anonymous">
	</head>
	<body>
		<div class="container ">
			<h2>新闻列表</h2>
			<ul>
				%s
			</ul>
		</div>
	</body>
	</html>
`