Packages: craigslist, crawler, visitor, scraper, proxy, store
export IN_CLOAK_API_KEY=
export MASHAPE_API_KEY=
export KING_PROXY_API_KEY=
export ELASTIC_SEARCH_URL=
Crawler:
import (
"github.com/rentapplication/craigjr/craigslist"
"github.com/rentapplication/craigjr/crawler"
"github.com/rentapplication/craigjr/proxy"
)
proxy := proxy.NewList()
proxy.LoadDefault()
posts := make(chan crawler.PaginationIterator)
go craigslist.StreamCities(posts)
crawlers := crawler.NewPool(proxy)
go crawlers.Crawl(posts)
Visitor:
import (
"github.com/rentapplication/craigjr/proxy"
"github.com/rentapplication/craigjr/store"
"github.com/rentapplication/craigjr/visitor"
)
proxy := proxy.NewList()
proxy.LoadDefault()
visitors := visitor.NewPool(proxy)
go visitors.Visit(crawlers.Urls)
- Proxy List as HTTP Transport
- Posts scrapes full URLs for each Post
- Pull from City List for Crawlers to produce Post URLs
- Visitors to pull from URL stream
- Index Posts into Elasticsearch
- More Proxy Sources
- Balance pools based on limited resources (automatically preferred)