parseur

package module
v0.0.0-...-f9966e8 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 7, 2025 License: Apache-2.0 Imports: 13 Imported by: 0

README

parseur

parseur is a simple HTML parser that supports asynchronous hooks and can cancel a request preemptively, based on the result of evaluating those hooks.

Features

  • Asynchronous hooks
  • Preemptive request cancellation
  • Simple and intuitive API

Installation

To install the parseur library, run:

go get github.com/muzzletov/parseur

Usage

Here is a simple example of how to use parseur:

package main

import (
	"github.com/muzzletov/parseur"
	"log"
)

// fetchOpenGraphTags fetches a YouTube watch page with FetchParseAsync and
// prints the content attribute of every <meta> tag whose property attribute
// is "og:video:tag". If the fetch returns an error, the process is terminated
// via log.Fatal.
func fetchOpenGraphTags() {
	client := parseur.NewClient()

	hook := func(p *parseur.Parser) {
		// Do nothing until the parser reports that <head> exists; per the
		// original author's note this makes sure all the meta tags are there.
		// NOTE(review): presumably the hook runs while the document is still
		// being parsed — confirm against the library.
		if !p.Query("head").First().Exists() {
			return
		}

		metaTags := p.Query("meta").Get()
		if metaTags == nil {
			// Nothing to inspect; avoid dereferencing a nil result.
			return
		}

		for _, tag := range *metaTags {
			// A missing "property" key yields "", which never equals the
			// literal, so no separate presence check is needed.
			if tag.Attributes["property"] != "og:video:tag" {
				continue
			}

			// Replace InBound with a predicate that always returns false.
			// NOTE(review): presumably this is how a hook requests the
			// preemptive cancellation described in the README — confirm.
			p.InBound = func(i int) bool {
				return false
			}
			println(tag.Attributes["content"])
		}
	}

	pageURL := "https://www.youtube.com/watch?v=pQO1t2Y627Y"
	_, err := client.FetchParseAsync(&parseur.Request{
		Url:  &pageURL,
		Hook: &hook,
	})

	if err != nil {
		// log.Fatal exits the process, so no return is needed after it.
		log.Fatal(err)
	}
}

// main is the example's entry point; it only runs fetchOpenGraphTags.
func main() {
	fetchOpenGraphTags()
}

API at a Glance

Parsing Functions
  • func NewEscapedParser(body *[]byte) *Parser
  • func NewParser(body *[]byte, async bool, hook *func(p *Parser)) *Parser
  • func (p *Parser) GetBody() []byte
  • func (p *Parser) GetJoinedText(separator byte) string
  • func (p *Parser) GetRoot() *Tag
  • func (p *Parser) GetSize() int
  • func (p *Parser) GetTagMap() map[string]struct{}
  • func (p *Parser) GetTags(query string) *[]*Tag
  • func (p *Parser) GetText() string
  • func (p *Parser) Query(query string) *Query
Query Functions
  • func (q *Query) First() *QueryTag
  • func (q *Query) Last() *QueryTag
  • func (q *Query) Get() *[]*QueryTag
  • func (q *Query) Query(query string) *Query
  • func (qt *QueryTag) Query(query string) *Query
Web Client Functions
  • func NewClient() *WebClient
  • func (c *WebClient) Fetch(url string) (*[]byte, error)
  • func (c *WebClient) FetchParseAsync(request *Request) (p *Parser, err error)
  • func (c *WebClient) FetchParseSync(request *Request) (p *Parser, err error)
  • func (c *WebClient) FetchSync(request *Request) error
  • func (c *WebClient) GetHttpClient() *http.Client
  • func (c *WebClient) LoadCookies()
  • func (c *WebClient) PersistCookies()
  • func (c *WebClient) SetChunkSize(size int)
  • func (c *WebClient) SetUserAgent(agent string)

Examples

For more examples, please refer to the example folder in the repository.

Documentation

Index

Constants

View Source
// Exported integer constants: FAILED is -1 and PARSING is 0.
// NOTE(review): presumably parser status codes; where they are produced or
// consumed is not visible here — confirm before relying on them.
const (
	FAILED  = -1
	PARSING = 0
)

Variables

This section is empty.

Functions

func MapFromTerms

func MapFromTerms(text string) *map[string]struct{}

Types

type ExtJar

// ExtJar is created with NewJar. Its Cookies and SetCookies methods have the
// same signatures as the methods of net/http's CookieJar interface, and its
// Load and Save methods each take a filename and return an error.
// NOTE(review): presumably Load/Save read and write the cookies to that
// file — confirm.
type ExtJar struct {
	// contains filtered or unexported fields
}

func NewJar

func NewJar() *ExtJar

func (*ExtJar) Cookies

func (j *ExtJar) Cookies(u *url.URL) (cookies []*http.Cookie)

func (*ExtJar) Load

func (j *ExtJar) Load(filename string) error

func (*ExtJar) Save

func (j *ExtJar) Save(filename string) error

func (*ExtJar) SetCookies

func (j *ExtJar) SetCookies(u *url.URL, cookies []*http.Cookie)

type Offset

// Offset is a pair of integer positions, Start and End.
// NOTE(review): presumably indices into the parsed body; whether End is
// inclusive is not shown here — confirm.
type Offset struct {
	Start int
	End   int
}

type Parser

// Parser is the type returned by NewParser and NewEscapedParser and passed to
// request hooks. Besides the exported fields below it has unexported state.
type Parser struct {
	// NOTE(review): Done and Complete presumably report parse progress; how
	// they differ is not shown here — confirm.
	Done bool

	Complete *bool

	// NOTE(review): presumably DataChan carries body chunks and ParseComplete
	// signals the end of parsing — confirm.
	DataChan      chan *[]byte
	ParseComplete chan struct{}

	// InBound is a replaceable predicate over an int. The README example
	// overwrites it from inside a hook with a function that always returns
	// false.
	// NOTE(review): presumably that is how parsing/fetching is stopped
	// early — confirm.
	InBound       func(int) bool
	GetOffsetList func() []*Tag
	// NOTE(review): what state Mu guards is not shown here — confirm.
	Mu            sync.Mutex
	Request       *Request
	// contains filtered or unexported fields
}

func NewEscapedParser

func NewEscapedParser(body *[]byte) *Parser

func NewParser

func NewParser(body *[]byte, async bool, hook *func(p *Parser)) *Parser

func (*Parser) Filter

func (p *Parser) Filter(name string) []*Tag

func (*Parser) First

func (p *Parser) First(name string) *Tag

func (*Parser) GetBody

func (p *Parser) GetBody() []byte

func (*Parser) GetJoinedText

func (p *Parser) GetJoinedText(separator byte) string

func (*Parser) GetRoot

func (p *Parser) GetRoot() *Tag

func (*Parser) GetSize

func (p *Parser) GetSize() int

func (*Parser) GetTagMap

func (p *Parser) GetTagMap() map[string]struct{}

func (*Parser) GetTags

func (p *Parser) GetTags(query string) *[]*Tag

func (*Parser) GetText

func (p *Parser) GetText() string

func (*Parser) Query

func (p *Parser) Query(query string) *Query

type Query

// Query is the value returned by Parser.Query, Query.Query and QueryTag.Query.
// Its First, Last and Get methods return *QueryTag values (Get returns a
// pointer to a slice of them).
type Query struct {
	// contains filtered or unexported fields
}

func (*Query) First

func (q *Query) First() *QueryTag

func (*Query) Get

func (q *Query) Get() *[]*QueryTag

func (*Query) Last

func (q *Query) Last() *QueryTag

func (*Query) Query

func (q *Query) Query(query string) *Query

type QueryTag

// QueryTag is the element type produced by Query's First, Last and Get.
// It embeds *Tag, so Tag's exported fields (for example Attributes) are
// accessible directly on a QueryTag.
type QueryTag struct {
	*Tag
	// contains filtered or unexported fields
}

func (*QueryTag) Exists

func (qt *QueryTag) Exists() bool

func (*QueryTag) InnerText

func (qt *QueryTag) InnerText() string

func (*QueryTag) OuterText

func (qt *QueryTag) OuterText() string

func (*QueryTag) Query

func (qt *QueryTag) Query(query string) *Query

type Request

// Request is the argument taken by WebClient's FetchSync, FetchParseSync and
// FetchParseAsync. The README example sets only Url and Hook.
type Request struct {
	RequestHeader  *http.Header
	ResponseHeader *http.Header
	// NOTE(review): the distinction between Data and Payload is not shown
	// here — presumably one is the request body and the other the response
	// body; confirm.
	Data           *[]byte
	Payload        *[]byte
	// Url points at the address to fetch.
	Url            *string
	// Hook points at a callback that receives the *Parser.
	// NOTE(review): when and how often it is invoked is not shown here —
	// confirm.
	Hook           *func(p *Parser)
	// Embedded pointer to a context.CancelFunc.
	// NOTE(review): presumably populated by the client so the request can be
	// cancelled — confirm.
	*context.CancelFunc
	Method string
}

type Tag

// Tag describes one parsed tag: its Name and Namespace, its child tags, and
// its attributes as a string-to-string map.
type Tag struct {
	Name       string
	Namespace  string
	Children   []*Tag
	// Attributes maps attribute names to values; the README example reads
	// the "property" and "content" attributes of meta tags.
	Attributes map[string]string
	// NOTE(review): Body and Tag are presumably the extents of the tag's
	// inner content and of the whole tag within the source — confirm.
	Body       Offset
	Tag        Offset
}

type WebClient

// WebClient is created with NewClient. Its methods cover fetching (Fetch,
// FetchSync, FetchParseSync, FetchParseAsync), access to the underlying
// *http.Client, cookie loading/persisting, and setting the chunk size and
// user agent.
type WebClient struct {
	// contains filtered or unexported fields
}

func NewClient

func NewClient() *WebClient

func (*WebClient) Fetch

func (c *WebClient) Fetch(url string) (*[]byte, error)

func (*WebClient) FetchParseAsync

func (c *WebClient) FetchParseAsync(request *Request) (p *Parser, err error)

func (*WebClient) FetchParseSync

func (c *WebClient) FetchParseSync(request *Request) (p *Parser, err error)

func (*WebClient) FetchSync

func (c *WebClient) FetchSync(request *Request) error

func (*WebClient) GetHttpClient

func (c *WebClient) GetHttpClient() *http.Client

func (*WebClient) LoadCookies

func (c *WebClient) LoadCookies()

func (*WebClient) PersistCookies

func (c *WebClient) PersistCookies()

func (*WebClient) SetChunkSize

func (c *WebClient) SetChunkSize(size int)

func (*WebClient) SetUserAgent

func (c *WebClient) SetUserAgent(agent string)

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL