convert.go 1.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778
  1. package main
  2. import (
  3. "fmt"
  4. "log"
  5. "os"
  6. "strings"
  7. "golang.org/x/net/html"
  8. )
  9. func main() {
  10. fi, err := os.Open(os.Args[1])
  11. if err != nil {
  12. log.Fatal(err)
  13. }
  14. doc, err := html.Parse(fi)
  15. if err != nil {
  16. log.Fatal(err)
  17. }
  18. content := findContent(doc)
  19. fmt.Println(extractText(content))
  20. }
  21. func findContent(n *html.Node) *html.Node {
  22. for _, attr := range n.Attr {
  23. if attr.Key == "id" && attr.Val == "content" {
  24. return n
  25. }
  26. }
  27. for c := n.FirstChild; c != nil; c = c.NextSibling {
  28. if found := findContent(c); found != nil {
  29. return found
  30. }
  31. }
  32. return nil
  33. }
  34. func clean(data string) string {
  35. data = strings.TrimSpace(data)
  36. return data
  37. }
  38. func extractText(n *html.Node) string {
  39. text := ""
  40. if n.Type == html.TextNode {
  41. return clean(n.Data)
  42. }
  43. if n.Type == html.ElementNode && n.Data == "br" {
  44. return "\n"
  45. }
  46. for c := n.FirstChild; c != nil; c = c.NextSibling {
  47. extr := extractText(c)
  48. if extr != "" {
  49. text += extr
  50. }
  51. }
  52. if n.Type == html.ElementNode &&
  53. (n.Data == "p" || n.Data == "div" || n.Data == "li") &&
  54. !strings.HasSuffix(text, "\n") {
  55. text += "\n"
  56. }
  57. return text
  58. }
  59. func toMap(content string) map[string]interface{} {
  60. return nil
  61. }