HaskellでHTMLをパースする
使い方
hoge.hsに下のコードを保存する。
runghc "hoge.hs" "target.html" ※ダブルクオーテーションは必要
このままでは空行が大量に含まれているので
runghc "hoge.hs" "target.html" | sed '/^ *$/d'
すると良い
--sed '/^ *$/d' import Text.ParserCombinators.Parsec import System.Environment data MyTag = MakeMyTag {tagName::String,tagContent::[String]} deriving (Eq,Show) testMyTag = MakeMyTag "aa" ["bb","cc"] data MyHTMLContent = MakeContentChild { myLeftTag::MyTag,myContentMain::[MyHTMLContent],myRightTag::MyTag } | MakeTagSelf MyTag | MakePlainText { myText::String } | MakeComment deriving (Eq,Show) testHTMLContent1 = (MakePlainText "dd") testHTMLContent2 = (MakeTagSelf testMyTag) testHTMLContent3 = (MakeContentChild testMyTag [testHTMLContent1,testHTMLContent2] testMyTag) outputHTML (MakeComment) n = "" outputHTML (MakePlainText "") n = "" outputHTML (MakePlainText s) n = (sTab n++s++"\n") outputHTML (MakeTagSelf x) n = "" outputHTML (MakeContentChild l con r) n = sCon con (n+1) sTab 0 = "" sTab n = " "++sTab (n-1) sCon [] _ = "" sCon (x:xs) n = (outputHTML x n++sCon xs n) main = do inStr<-getArgs tmp <- parseFromFile myHTML (head inStr) case tmp of Left err -> do {print "error"; } Right xs -> do {putStrLn (func xs); } func :: [MyHTMLContent]->String func [] = "" func (x:xs) = (outputHTML x 0)++(func xs) isNotEqual c = if c=='=' then False else True myDigit :: Parser Char myDigit = oneOf ['0'..'9'] myNumberString :: Parser String myNumberString = do c<-myDigit do cs <- myNumberString return (c:cs) <|> return [c] myTagContentLeft :: Parser String myTagContentLeft = many1 (noneOf "=>/") -- myTagContentLeft = many1 (satisfy (\c -> c /= '=')) --myTagContentLeft = many1 (alphaNum <|> char ':') {-where rest = do{ x<-satisfy (\c -> c /= '=') ; xs<-rest ; return (x:xs) } <|> return "" -} myTagContentRight :: Parser String myTagContentRight = do lD<-char '"' mD<-inC rD<-char '"' return ([lD]++mD++[rD]) <|> myNumberString where inC = do{ x<-satisfy (\c -> c /= '"') ; xs<-inC ; return (x:xs) } <|> return "" myTagContent :: Parser String myTagContent = do left<-myTagContentLeft char '=' right<-myTagContentRight return (left++"="++right) myTagName :: Parser String myTagName = many1 alphaNum myEndTag :: Parser MyTag myEndTag = do string "</" many space name<-myTagName many space char '>' return (MakeMyTag name []) myBeginTag :: Parser MyTag myBeginTag = do char '<' name<-myTagName many space tagContent<-many p char '>' return (MakeMyTag name tagContent) where p = do r<-myTagContent many space return r mySelfTag :: Parser MyTag mySelfTag = do char '<' name<-myTagName many space tagContent<-many p string "/>" return (MakeMyTag name tagContent) where p = do r<-myTagContent many space return r myComment :: Parser MyHTMLContent myComment = do string "<!--" rest "aa" char '>' return MakeComment -- <?> "myComment" where rest before = if before == "--" then return () else do x<-anyChar rest (tail before++[x]) myContent :: Parser MyHTMLContent myContent = do {p<-many1 (satisfy (\c-> c/='<')); return ( MakePlainText p)} <|> myBeginAndEndTag <|> do {t<-try mySelfTag; return (MakeTagSelf t)} <|> try myComment <?> "myContent" {--myContent = try( do {b<-myBeginTag; l<-myContentList; e<-myEndTag; return (MakeContentChild b l e)}) <|> try( do {t<-mySelfTag; return (MakeTagSelf t)}) <|> do {p<-many1 (satisfy (\c-> c/='<')); return ( MakePlainText p)}--} myBeginAndEndTag :: Parser MyHTMLContent myBeginAndEndTag = do b<-try myBeginTag l<-myContentList e<-myEndTag if tagName b == tagName e then return (MakeContentChild b l e) else fail $ "tag don't match "++tagName b++" and "++tagName e myContentList :: Parser [MyHTMLContent] myContentList = (do { r<-myContent; l<-myContentList; return ([r]++l)}) <|> do return [] --myContentList = try (do {r<-myContent; many myComment; l<-myContentList; return ([r]++l)}) myHTML :: Parser [MyHTMLContent] myHTML = do x <- myContentList eof return x