HaskellでHTMLをパースする

使い方
 hoge.hsに下のコードを保存する。
 runghc "hoge.hs" "target.html" ※ダブルクオーテーションは必要

 このままでは空行が大量に含まれているので
 runghc "hoge.hs" "target.html" | sed '/^ *$/d'
 すると良い

--sed '/^ *$/d'
import Text.ParserCombinators.Parsec
import System.Environment

data MyTag = MakeMyTag {tagName::String,tagContent::[String]} deriving (Eq,Show)

testMyTag = MakeMyTag "aa" ["bb","cc"]

data MyHTMLContent = MakeContentChild { myLeftTag::MyTag,myContentMain::[MyHTMLContent],myRightTag::MyTag }
	| MakeTagSelf MyTag
	| MakePlainText { myText::String }
	| MakeComment
	deriving (Eq,Show)

testHTMLContent1 = (MakePlainText "dd")
testHTMLContent2 = (MakeTagSelf testMyTag)
testHTMLContent3 = (MakeContentChild testMyTag [testHTMLContent1,testHTMLContent2] testMyTag)

outputHTML (MakeComment) n = ""
outputHTML (MakePlainText "") n = ""
outputHTML (MakePlainText s) n = (sTab n++s++"\n")
outputHTML (MakeTagSelf x) n = ""
outputHTML (MakeContentChild l con r) n = sCon con (n+1)
		
sTab 0 = ""
sTab n = "  "++sTab (n-1)

sCon [] _ = "" 
sCon (x:xs) n = (outputHTML x n++sCon xs n)

main = do
	inStr<-getArgs
	tmp <- parseFromFile myHTML (head inStr)
	case tmp of
		Left err -> do {print "error"; }
		Right xs -> do {putStrLn (func xs); }

func :: [MyHTMLContent]->String
func [] = ""
func (x:xs) = (outputHTML x 0)++(func xs)

isNotEqual c = if c=='='
	then False
	else True

myDigit :: Parser Char
myDigit = oneOf ['0'..'9']

myNumberString :: Parser String
myNumberString = do
	c<-myDigit
	do
			cs <- myNumberString
			return (c:cs)
		<|> return [c]

myTagContentLeft :: Parser String
myTagContentLeft = many1 (noneOf "=>/")
-- myTagContentLeft = many1 (satisfy (\c -> c /= '='))
--myTagContentLeft = many1 (alphaNum <|> char ':')
{-where
		rest = do{ x<-satisfy (\c -> c /= '=')
				; xs<-rest
				; return (x:xs) 
				}
			<|> return ""
-}

myTagContentRight :: Parser String
myTagContentRight = do
		lD<-char '"'
		mD<-inC
		rD<-char '"'
		return ([lD]++mD++[rD])
	<|> myNumberString
		where

			inC = do{ x<-satisfy (\c -> c /= '"')
					; xs<-inC
					; return (x:xs)
					}
				<|> return ""

myTagContent :: Parser String
myTagContent = do
	left<-myTagContentLeft
	char '='
	right<-myTagContentRight
	return (left++"="++right)

myTagName :: Parser String
myTagName = many1 alphaNum

myEndTag :: Parser MyTag
myEndTag = do
	string "</"
	many space
	name<-myTagName
	many space
	char '>'
	return (MakeMyTag name [])

myBeginTag :: Parser MyTag
myBeginTag = do
	char '<'
	name<-myTagName
	many space
	tagContent<-many p
	char '>'
	return (MakeMyTag name tagContent)
		where
			p = do
				r<-myTagContent
				many space
				return r

mySelfTag :: Parser MyTag
mySelfTag = do
	char '<'
	name<-myTagName
	many space
	tagContent<-many p
	string "/>"
	return (MakeMyTag name tagContent)
		where
			p = do
				r<-myTagContent
				many space
				return r

myComment :: Parser MyHTMLContent
myComment = do
		string "<!--"
		rest "aa"
		char '>'
		return MakeComment
	-- <?> "myComment"
	
	where 
		rest before =
			if before == "--"
			then return ()
			else do
				x<-anyChar
				rest (tail before++[x])
				
myContent :: Parser MyHTMLContent
myContent = do {p<-many1 (satisfy (\c-> c/='<')); return ( MakePlainText p)}
	<|> myBeginAndEndTag
		<|> do  {t<-try mySelfTag; return (MakeTagSelf t)}
		<|> try myComment
		<?> "myContent"

{--myContent = try( do {b<-myBeginTag; l<-myContentList; e<-myEndTag; return  (MakeContentChild b l e)})
		<|> try( do  {t<-mySelfTag; return (MakeTagSelf t)})
		<|>  do {p<-many1 (satisfy (\c-> c/='<')); return ( MakePlainText p)}--}

myBeginAndEndTag :: Parser MyHTMLContent
myBeginAndEndTag = do
	b<-try myBeginTag
	l<-myContentList
	e<-myEndTag
	if tagName b == tagName e 
		then return (MakeContentChild b l e)
		else fail $ "tag don't match "++tagName b++" and "++tagName e
	
myContentList :: Parser [MyHTMLContent]
myContentList = (do	{ r<-myContent; l<-myContentList; return ([r]++l)})
			<|> do return []
--myContentList = try (do	{r<-myContent; many myComment; l<-myContentList; return ([r]++l)})

myHTML :: Parser [MyHTMLContent]
myHTML = do
	x <- myContentList
	eof
	return x