webextractor

package module
v0.1.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: May 25, 2025 License: MIT Imports: 11 Imported by: 0

README

WebExtractor

Go Reference

Package webextractor provides a ready-to-use and extensible web crawling and scraping framework for extracting structured data from the web.

See Colibri.

Installation

 $ go get github.com/gonzxlezs/webextractor

Do

package main

import (
	"encoding/json"
	"fmt"
	"net/url"

	"github.com/gonzxlezs/webextractor"
	"github.com/gonzxlezs/webextractor/colibri"
)

// rawRules is the JSON-encoded crawl configuration decoded into a
// colibri.Rules value in main. IgnoreRobotsTxt skips the robots.txt check.
var rawRules = `{
	"Method": "GET",
	"IgnoreRobotsTxt": true
}`

// main demonstrates webextractor's Do method: it builds an extractor with
// the default configuration, decodes the crawl rules from rawRules, fetches
// https://example.com, and prints basic response metadata.
func main() {
	extractor, err := webextractor.New()
	if err != nil {
		panic(err)
	}

	var crawlRules colibri.Rules
	if err = json.Unmarshal([]byte(rawRules), &crawlRules); err != nil {
		panic(err)
	}

	target, err := url.Parse("https://example.com")
	if err != nil {
		panic(err)
	}

	out, err := extractor.Do(target, &crawlRules)
	if err != nil {
		panic(err)
	}

	fmt.Println("URL:", out.Response.URL())
	fmt.Println("Status code:", out.Response.StatusCode())
	fmt.Println("Content-Type", out.Response.Header().Get("Content-Type"))
}
URL: https://example.com
Status code: 200
Content-Type text/html

Extract

package main

import (
	"encoding/json"
	"fmt"
	"net/url"

	"github.com/gonzxlezs/webextractor"
	"github.com/gonzxlezs/webextractor/colibri"
)

// rawRules is the JSON-encoded crawl configuration decoded into a
// colibri.Rules value in main. Selectors maps output field names to
// XPath expressions evaluated against the fetched document.
var rawRules = `{
	"Method": "GET",
	"Selectors": {
		"title": "//title"
	}
}`

// main demonstrates webextractor's Extract method: it builds an extractor
// with the default configuration, decodes the rules (including the XPath
// selectors) from rawRules, fetches https://example.com, and prints the
// response metadata together with the extracted data.
func main() {
	extractor, err := webextractor.New()
	if err != nil {
		panic(err)
	}

	var crawlRules colibri.Rules
	if err = json.Unmarshal([]byte(rawRules), &crawlRules); err != nil {
		panic(err)
	}

	target, err := url.Parse("https://example.com")
	if err != nil {
		panic(err)
	}

	result, err := extractor.Extract(target, &crawlRules)
	if err != nil {
		panic(err)
	}

	fmt.Println("URL:", result.Response.URL())
	fmt.Println("Status code:", result.Response.StatusCode())
	fmt.Println("Content-Type", result.Response.Header().Get("Content-Type"))
	fmt.Println("Data:", result.Data)
}
URL: https://example.com
Status code: 200
Content-Type text/html
Data: map[title:Example Domain]

Documentation

Overview

Package webextractor provides a ready-to-use and extensible web crawling and scraping framework for extracting structured data from the web.

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func DefaultHTTPTransport

func DefaultHTTPTransport() *http.Transport

func New

func New() (*colibri.Colibri, error)

New returns a new instance of colibri.Colibri with the default configuration.

Types

type Client

type Client struct {
	// Jar specifies the cookie jar.
	Jar *CookieJar
	// contains filtered or unexported fields
}

Client represents an HTTP client. See the colibri.Client interface.

func NewClient

func NewClient() *Client

func (*Client) Clear

func (client *Client) Clear()

func (*Client) Delay

func (client *Client) Delay(u *url.URL, duration time.Duration)

func (*Client) Do

func (client *Client) Do(c *colibri.Colibri, u *url.URL, rules *colibri.Rules) (colibri.Response, error)

type CookieJar

type CookieJar struct {
	// contains filtered or unexported fields
}

CookieJar is a concurrency-safe “cookiejar.Jar” wrapper.

func NewCookieJar

func NewCookieJar() *CookieJar

func (*CookieJar) Clear

func (cj *CookieJar) Clear()

func (*CookieJar) Cookies

func (cj *CookieJar) Cookies(u *url.URL) []*http.Cookie

func (*CookieJar) SetCookies

func (cj *CookieJar) SetCookies(u *url.URL, cookies []*http.Cookie)

type Response

type Response struct {
	HTTP *http.Response
	// contains filtered or unexported fields
}

Response represents an HTTP response. See the colibri.Response interface.

func (*Response) Body

func (resp *Response) Body() io.ReadCloser

func (*Response) Header

func (resp *Response) Header() http.Header

func (*Response) MarshalJSON

func (resp *Response) MarshalJSON() ([]byte, error)

func (*Response) Redirects

func (resp *Response) Redirects() []*url.URL

func (*Response) Serializable

func (resp *Response) Serializable() map[string]any

func (*Response) StatusCode

func (resp *Response) StatusCode() int

func (*Response) URL

func (resp *Response) URL() *url.URL

type RobotsData

type RobotsData struct {
	// contains filtered or unexported fields
}

RobotsData gets, stores and parses robots.txt restrictions. See the colibri.RobotsTxt interface.

func NewRobotsData

func NewRobotsData() *RobotsData

func (*RobotsData) Clear

func (robots *RobotsData) Clear()

Clear removes stored robots.txt restrictions.

func (*RobotsData) IsAllowed

func (robots *RobotsData) IsAllowed(c *colibri.Colibri, u *url.URL, rules *colibri.Rules) (colibri.Response, error)

IsAllowed verifies that the User-Agent can access the URL. It gets and stores the robots.txt restrictions of the URL's host for reuse with other URLs on the same host.

Directories

Path Synopsis
Package colibri is the extensible core for web crawling and scraping, designed to facilitate the extraction of structured data from the web.
Package colibri is the extensible core for web crawling and scraping, designed to facilitate the extraction of structured data from the web.
Package parsers provides implementations of the colibri.Parser interface to parse different web content formats.
Package parsers provides implementations of the colibri.Parser interface to parse different web content formats.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL