Step 1
- Let's take the existing Scrapper and pass the search term in through the web.
- Collect data for that search term, then let the user download the resulting file and delete it afterwards.
- `scrapper/scrapper.go`: add a UTF-8 BOM so the CSV opens correctly on Windows, and flush the writer so the file can be removed later.
package scrapper

import (
    "encoding/csv"
    "fmt"
    "log"
    "net/http"
    "os"
    "strconv"
    "strings"
    "time"

    "github.com/PuerkitoBio/goquery"
)

type extractedJob struct {
    id       string
    title    string
    location string
    salary   string
    summary  string
}

// Scrape collects job postings for the given search term and writes them to jobs.csv.
func Scrape(term string) {
    var baseURL string = "https://kr.indeed.com/jobs?q=" + term + "&limit=50"
    fmt.Println("start")
    fmt.Println(time.Now())
    var jobs []extractedJob
    c := make(chan []extractedJob)
    totalPages := getPages(baseURL)
    // Scrape every result page concurrently, then gather the results from the channel.
    for i := 0; i < totalPages; i++ {
        go getPage(i, baseURL, c)
    }
    for i := 0; i < totalPages; i++ {
        extractedJobs := <-c
        jobs = append(jobs, extractedJobs...)
    }
    writeJobs(jobs)
    fmt.Println("Done, extracted", len(jobs))
    fmt.Println("end")
    fmt.Println(time.Now())
}

func writeJobs(jobs []extractedJob) {
    file, err := os.Create("jobs.csv")
    checkErr(err)
    // Write a UTF-8 BOM first so the CSV opens with the right encoding on Windows.
    utf8bom := []byte{0xEF, 0xBB, 0xBF}
    file.Write(utf8bom)
    w := csv.NewWriter(file)
    // Deferred calls run last-in-first-out: flush the CSV buffer before the file
    // is closed, so every row lands on disk and the handle is released for os.Remove.
    defer file.Close()
    defer w.Flush()
    headers := []string{"Link", "Title", "Location", "Salary", "Summary"}
    wErr := w.Write(headers)
    checkErr(wErr)
    for _, job := range jobs {
        jobSlice := []string{"https://kr.indeed.com/jobs?q=python&vjk=" + job.id, job.title, job.location, job.salary, job.summary}
        wErr := w.Write(jobSlice)
        checkErr(wErr)
    }
}

// getPage fetches one result page and extracts every job card on it concurrently.
func getPage(page int, baseURL string, mainC chan<- []extractedJob) {
    var jobs []extractedJob
    c := make(chan extractedJob)
    pageURL := baseURL + "&start=" + strconv.Itoa(page*50)
    res, err := http.Get(pageURL)
    checkErr(err)
    checkCode(res)
    defer res.Body.Close()
    doc, err := goquery.NewDocumentFromReader(res.Body)
    checkErr(err)
    searchCards := doc.Find(".resultWithShelf")
    searchCards.Each(func(i int, card *goquery.Selection) {
        go extractJob(card, c)
    })
    for i := 0; i < searchCards.Length(); i++ {
        job := <-c
        jobs = append(jobs, job)
    }
    mainC <- jobs
}

// extractJob pulls the fields we need out of a single job card.
func extractJob(card *goquery.Selection, c chan<- extractedJob) {
    id, _ := card.Attr("data-jk")
    title := CleanString(card.Find(".jobTitle>span").Text())
    location := CleanString(card.Find(".companyLocation").Text())
    salary := CleanString(card.Find(".salary-snippet>span").Text())
    summary := CleanString(card.Find(".job-snippet").Text())
    c <- extractedJob{
        id:       id,
        title:    title,
        location: location,
        salary:   salary,
        summary:  summary}
}

// CleanString trims the string and collapses every whitespace run into a single space.
func CleanString(str string) string {
    return strings.Join(strings.Fields(strings.TrimSpace(str)), " ")
}

// getPages counts how many result pages the pagination block links to.
func getPages(baseURL string) int {
    pages := 0
    res, err := http.Get(baseURL)
    checkErr(err)
    checkCode(res)
    defer res.Body.Close()
    doc, err := goquery.NewDocumentFromReader(res.Body)
    checkErr(err)
    doc.Find(".pagination").Each(func(i int, s *goquery.Selection) {
        pages = s.Find("a").Length()
    })
    return pages
}

func checkErr(err error) {
    if err != nil {
        log.Fatalln(err)
    }
}

func checkCode(res *http.Response) {
    if res.StatusCode != 200 {
        log.Fatalln("Request failed with Status:", res.StatusCode)
    }
}
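Before putting a web server in front of it, the package can be exercised on its own. A minimal, throwaway sketch (it reuses the module path that the `main.go` below imports):
package main

import "kjham/learngo_scrapper/scrapper"

// Throwaway entry point: run the scrapper directly with a fixed term and
// check that jobs.csv appears in the working directory.
func main() {
    scrapper.Scrape("python")
}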
- `main.go`: put an Echo server in front of the scrapper.
package main

import (
    "kjham/learngo_scrapper/scrapper"
    "os"
    "strings"

    "github.com/labstack/echo"
)

// handleHome serves the search form page.
func handleHome(c echo.Context) error {
    return c.File("home.html")
}

const fileName string = "jobs.csv"

// handleScrape runs the scrapper with the submitted term, sends jobs.csv as a
// download, and removes the file once the response has been written.
func handleScrape(c echo.Context) error {
    defer os.Remove(fileName)
    term := strings.ToLower(scrapper.CleanString(c.FormValue("term")))
    scrapper.Scrape(term)
    return c.Attachment(fileName, fileName)
}

func main() {
    // Echo instance
    e := echo.New()
    e.GET("/", handleHome)
    e.POST("/scrape", handleScrape)
    // Start server
    e.Logger.Fatal(e.Start(":1323"))
}
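`handleHome` serves `home.html`, which is not included in the post; any page with a form that POSTs a `term` field to `/scrape` will do. If you don't have the file yet, one possible stand-in (this handler and its markup are an assumption, not part of the original project) is to return the form as inline HTML:
// Hypothetical replacement for handleHome while home.html is missing.
// The form only needs a "term" field POSTed to /scrape.
// Requires "net/http" in the import block for http.StatusOK.
func handleHomeInline(c echo.Context) error {
    return c.HTML(http.StatusOK, `
        <form method="POST" action="/scrape">
            <input name="term" placeholder="python" />
            <button>Search</button>
        </form>`)
}
The endpoint can also be checked without a browser, for example with `curl -d "term=python" http://localhost:1323/scrape -o jobs.csv`.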