package main import ( "fmt" "log" "os" "strings" "golang.org/x/net/html" ) func main() { fi, err := os.Open(os.Args[1]) if err != nil { log.Fatal(err) } doc, err := html.Parse(fi) if err != nil { log.Fatal(err) } content := findContent(doc) fmt.Println(extractText(content)) } func findContent(n *html.Node) *html.Node { for _, attr := range n.Attr { if attr.Key == "id" && attr.Val == "content" { return n } } for c := n.FirstChild; c != nil; c = c.NextSibling { if found := findContent(c); found != nil { return found } } return nil } func clean(data string) string { data = strings.TrimSpace(data) return data } func extractText(n *html.Node) string { text := "" if n.Type == html.TextNode { return clean(n.Data) } if n.Type == html.ElementNode && n.Data == "br" { return "\n" } for c := n.FirstChild; c != nil; c = c.NextSibling { extr := extractText(c) if extr != "" { text += extr } } if n.Type == html.ElementNode && (n.Data == "p" || n.Data == "div" || n.Data == "li") && !strings.HasSuffix(text, "\n") { text += "\n" } return text } func toMap(content string) map[string]interface{} { return nil }