gsoup

package module
v0.0.0-...-e4f4ca9 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Aug 21, 2018 License: MIT Imports: 8 Imported by: 0

README

Build Status PRs Welcome

gsoup

A tiny web scraper written in Go with similar features to jsoup

Getting started

go get github.com/saopayne/gsoup

Initializing the client

Usage


import (
	"fmt"
	"log"
	"os"

	"github.com/saopayne/gsoup"
)

// main lists the unique links collected from the seed URLs given as
// command-line arguments, starting one goroutine per seed URL and gathering
// the results over channels.
func main() {
	foundUrls := make(map[string]bool)
	seedUrls := os.Args[1:]

	// Channels to hold the concurrent requests
	chUrls := make(chan string)
	chFinished := make(chan bool)

	// Kick off the crawl process (concurrently) using a goroutine
	for _, url := range seedUrls {
		go listLinks(url, chUrls, chFinished)
	}

	// Subscribe to both channels. The loop ends once one finished signal
	// per seed URL has been received.
	// NOTE(review): assumes listLinks signals chFinished exactly once per
	// call — confirm against its implementation.
	for c := 0; c < len(seedUrls); {
		select {
		case url := <-chUrls:
			foundUrls[url] = true
		case <-chFinished:
			c++
		}
	}

	// Printf, not Sprintf: Sprintf only builds the string, and its result
	// was being discarded, so this header was never printed.
	fmt.Printf("\nUnique urls found are : %d\n", len(foundUrls))
	for url := range foundUrls {
		fmt.Println(" - " + url)
	}

	close(chUrls)
}

// main demonstrates accessing DOM elements: it fetches a page, parses it,
// and prints the text of one element and the title attribute of an image.
func main() {
	// NOTE(review): connect is unexported and does not appear in the package
	// index — confirm the exported entry point before copying this example.
	// Assumes the second result is an error value.
	resp, err := gsoup.connect("")
	if err != nil {
		log.Fatalf("connecting: %v", err)
	}

	doc := gsoup.HTMLParse(resp)
	// Root carries an Error field; stop early rather than querying a failed parse.
	if doc.Error != nil {
		log.Fatalf("parsing HTML: %v", doc.Error)
	}

	title := doc.Find("div", "id", "id_value").Text()
	// title was previously declared but never used, which Go rejects at compile time.
	fmt.Println("Title text :", title)

	image := doc.Find("div", "id", "imageid").Find("img")
	fmt.Println("Text linked to the image :", image.Attrs()["title"])
}

TODO

  • Write unit tests
  • Documentation

CONTRIBUTING

  • Fork the repository, make the necessary changes, and send a PR.

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func DisableDebug

func DisableDebug()

func EnableDebug

func EnableDebug()

func FindAllofem

func FindAllofem(n *html.Node, args []string) []*html.Node

FindAllofem uses depth-first search to find all occurrences and returns them.

func FindOnce

func FindOnce(n *html.Node, args []string, uni bool) (*html.Node, bool)

FindOnce uses depth-first search to find the first occurrence and returns it.

func GetKeyValue

func GetKeyValue(attributes []html.Attribute) map[string]string

GetKeyValue returns the attributes as a map of key-value pairs (like a dictionary), with one entry per attribute.

func HtmlToPlainText

func HtmlToPlainText(s string)

Types

type Root

// Root holds a pointer to an HTML node, the node value, and any error
// that occurred.
type Root struct {
	// Pointer is the pointer to the HTML node.
	Pointer   *html.Node
	// NodeValue is the node value.
	NodeValue string
	// Error reports an error, if one occurred.
	Error     error
}

Root is a structure containing a pointer to an HTML node, the node value, and an error variable that reports an error if one occurred.

func HTMLParse

func HTMLParse(s string) Root

HTMLParse parses the HTML returning a start pointer to the DOM

func (Root) Attrs

func (r Root) Attrs() map[string]string

Attrs returns a map containing all attributes

func (Root) Find

func (r Root) Find(args ...string) Root

Find finds the first occurrence of the given tag name, with or without attribute key and value specified, and returns a struct with a pointer to it

func (Root) FindAll

func (r Root) FindAll(args ...string) []Root

FindAll finds all occurrences of the given tag name, with or without an attribute key and value specified, and returns a slice of Root structs, each holding a pointer to one match.

func (Root) FindFirstChild

func (r Root) FindFirstChild() Root

FindFirstChild finds the first child of the pointer in the DOM returning a struct with a pointer to it

func (Root) FindLastChild

func (r Root) FindLastChild() Root

FindLastChild finds the last child of the pointer in the DOM returning a struct with a pointer to it

func (Root) FindNextElementSibling

func (r Root) FindNextElementSibling() Root

FindNextElementSibling finds the next element sibling of the pointer in the DOM returning a struct with a pointer to it

func (Root) FindNextSibling

func (r Root) FindNextSibling() Root

FindNextSibling finds the next sibling of the pointer in the DOM returning a struct with a pointer to it

func (Root) FindParent

func (r Root) FindParent() Root

FindParent finds the parent of the pointer in the DOM returning a struct with a pointer to it

func (Root) FindPrevElementSibling

func (r Root) FindPrevElementSibling() Root

FindPrevElementSibling finds the previous element sibling of the pointer in the DOM returning a struct with a pointer to it

func (Root) FindPrevSibling

func (r Root) FindPrevSibling() Root

FindPrevSibling finds the previous sibling of the pointer in the DOM returning a struct with a pointer to it

func (Root) Text

func (r Root) Text() string

Text returns the string inside a non-nested element

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL