convert.go 912 B

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162
  1. package main
  2. import (
  3. "fmt"
  4. "log"
  5. "os"
  6. "strings"
  7. "golang.org/x/net/html"
  8. )
  9. func main() {
  10. fi, err := os.Open(os.Args[1])
  11. if err != nil {
  12. log.Fatal(err)
  13. }
  14. doc, err := html.Parse(fi)
  15. if err != nil {
  16. log.Fatal(err)
  17. }
  18. content := findContent(doc)
  19. fmt.Println(extractText(content))
  20. }
  21. func findContent(n *html.Node) *html.Node {
  22. for _, attr := range n.Attr {
  23. if attr.Key == "id" && attr.Val == "content" {
  24. return n
  25. }
  26. }
  27. for c := n.FirstChild; c != nil; c = c.NextSibling {
  28. if found := findContent(c); found != nil {
  29. return found
  30. }
  31. }
  32. return nil
  33. }
  34. func extractText(n *html.Node) string {
  35. text := ""
  36. if n.Type == html.TextNode {
  37. data := strings.TrimSpace(n.Data)
  38. if data != "" {
  39. text += data + "\n"
  40. }
  41. }
  42. for c := n.FirstChild; c != nil; c = c.NextSibling {
  43. extr := strings.TrimSpace(extractText(c))
  44. if extr != "" {
  45. text += extr + "\n"
  46. }
  47. }
  48. return text
  49. }