diff --git a/README.txt b/README.txt new file mode 100644 index 0000000..c40370c --- /dev/null +++ b/README.txt @@ -0,0 +1,28 @@ +How to build: +$ cabal install --only-dependencies +$ cabal configure +$ cabal build + +Create documentation: +$ cabal haddock --executables + +Build and create documentation with all hyperlinks +$ cabal install --only-dependencies --enable-documentation +$ cabal configure +$ cabal build +$ cabal haddock --executables + +Run: +$ dist/build/hgraph/hgraph --help + +How to build (sandboxed, >=cabal 1.18, recommended): +$ cabal sandbox init +$ cabal install --only-dependencies +$ cabal configure +$ cabal build + +How to build (sandboxed, cabal-dev, deprecated): +$ cabal-dev install --only-dependencies +$ cabal-dev configure +$ cabal-dev build + diff --git a/src/DCB/DCB.hs b/src/DCB/DCB.hs index 6b8ab42..3da6849 100644 --- a/src/DCB/DCB.hs +++ b/src/DCB/DCB.hs @@ -1,11 +1,11 @@ -{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE OverlappingInstances #-} {-# LANGUAGE TypeOperators #-} {-# LANGUAGE TypeSynonymInstances #-} {-# LANGUAGE BangPatterns #-} ----------------------------------------------------------------------------- -- --- Module : DCB +-- Module : DCB.DCB -- Copyright : -- License : AllRightsReserved -- @@ -13,10 +13,11 @@ -- Stability : -- Portability : -- --- | ---DCB.DCB--------------------------------------------------------------------------- +-- | Computation of densely connected biclusters (DCB). +-- +--------------------------------------------------------------------------- -module DCB.DCB (preprocess, maxDCB, step) where +module DCB.DCB (preprocess, maxDCB, step, expand) where import Util import DCB.Structures import DCB.IO @@ -74,11 +75,18 @@ instance (A.Shape sh, V.Unbox e) => NFData (Array A.U sh e) where {-# INLINE rnf #-} --- | Calculates all maximum DCB. A maximum DCB is a densely connected bicluster that cannot be --- expanded by any additional node without breaking the constraints. +-- | Calculates all maximum DCB for the input seed graphs. A maximum DCB is a densely +-- connected bicluster that cannot be expanded by any additional node without breaking +-- the constraints. -- -- This function does return the seed graphs themselves if they cannot be expanded. -maxDCB :: [Graph] -> Adj -> Attr -> Density -> MaxDivergence -> Int -> [Graph] +maxDCB :: [Graph] -- ^ seed graphs + -> Adj -- ^ global adjacency matrix of all nodes + -> Attr -- ^ global attribute matrix + -> Density -- ^ required minimum graph’s density + -> MaxDivergence -- ^ allowed divergence per attribute + -> Int -- ^ required number of consistent attributes + -> [Graph] maxDCB [] _ _ _ _ _ = [] maxDCB gs adj attr dens maxDiv minHit = let next = L.map (expand adj attr dens maxDiv minHit) gs +|| (parBuffer 1000 rdeepseq) @@ -90,20 +98,34 @@ maxDCB gs adj attr dens maxDiv minHit = -- append maximum solutions of prospective function calls and maximum solutions of this iteration --- | creates a step in iteration. --- Basically calls expand for every Graph left in our List of interesting Graphs --- and returns the expanded ones. -step :: [Graph] -> Adj -> Attr -> Density -> MaxDivergence -> Int -> [Graph] +-- | Yields all DCB that arise by adding one single node to any of the given seed graphs. +-- +-- Basically calls 'expand' for every input graph and returns a list of the expanded ones. +-- Each graph only contains once within the resulting list. +step :: [Graph] + -> Adj -- ^ global adjacency matrix of all nodes + -> Attr -- ^ global attribute matrix + -> Density -- ^ required minimum graph’s density + -> MaxDivergence -- ^ allowed divergence per attribute + -> Int -- ^ required number of consistent attributes + -> [Graph] step gs@((ind,_,_):_) a b c d e = traceEvent ("step from " P.++ show (A.extent ind) ) $ filterLayer $ concat $ map (expand a b c d e ) gs +|| (parBuffer 1000 rdeepseq) --- TODO: remove @((ind,_,_):_) for exhaustive pattern +-- TODO: remove @((ind,_,_):_) for exhaustive pattern, ind only needed for traceEvent --- | calculates all possible additions to one Graph, yielding a list of valid expansions --- i.e. constraint a == Just Constraints for all returned Graphs -expand :: Adj -> Attr -> Density -> MaxDivergence -> Int -> Graph -> [Graph] +-- | Calculates all possible additions to one Graph, yielding a list of valid DCB +-- that fulfill all DCB constraints. +-- +expand :: Adj -- ^ global adjacency matrix of all nodes + -> Attr -- ^ global attribute matrix + -> Density -- ^ required minimum graph’s density + -> MaxDivergence -- ^ allowed divergence per attribute + -> Int -- ^ required number of consistent attributes + -> Graph -- ^ graph to expand + -> [Graph] expand adj attr d div req g@(ind,_,_) = --trace ("expanding graph "P.++ B.unpack (outputGraph [g])) mapMaybe (addPoint adj attr d div req g) (V.toList $ V.findIndices (==True) $ A.toUnboxed $ addablePoints adj g) @@ -231,7 +253,7 @@ addPoint :: Adj -- ^ global adjacency matrix of all nodes -> Attr -- ^ global attribute matrix -> Density -- ^ required minimum graph’s density -> MaxDivergence -- ^ allowed divergence per attribute - -> Int -- ^ equired number of consistent attributes + -> Int -- ^ required number of consistent attributes -> Graph -- ^ base graph -> Int -- ^ node to extend base graph by -> Maybe Graph diff --git a/src/DCB/IO.hs b/src/DCB/IO.hs index 9ce0c42..5fd0088 100644 --- a/src/DCB/IO.hs +++ b/src/DCB/IO.hs @@ -1,5 +1,7 @@ -{-# LANGUAGE OverlappingInstances #-} +{-# LANGUAGE OverlappingInstances #-} {-# LANGUAGE TypeSynonymInstances #-} + +-- | Functions to transform graphs and DCBs into text. module DCB.IO where import Control.Parallel.Strategies @@ -15,14 +17,14 @@ import Util --- | creates a default-formatted output with \",\" in between elements --- and \"\\n\" in between dimensions +-- | creates a default-formatted output with @,@ in between elements +-- and @\\n@ in between dimensions -- -- calls '_outputArray' with preset properties outputArray :: (Unbox a, Show a) => Array U DIM2 a -> B.ByteString outputArray a = _outputArray a "\t" "\n" --- | creates a formatted output from a DIM2 repa-Array +-- | creates a formatted output from a 'DIM2' repa-'Data.Array.Repa.Array' -- -- * First String is the between-element-separator -- @@ -40,8 +42,8 @@ _outputArray a itt lt = B.concat $ | sj-1 == j = show (a!(ix2 i j)) -- no "," for last one.. | otherwise = show (a!(ix2 i j)) ++ itt ++ (_outputArray'' shape i (j+1) a itt) --- | creates a default-formatted output with \",\" in between elements --- and \"\\n\" in between dimensions +-- | creates a default-formatted output with @,@ in between elements +-- and @\\n@ in between dimensions -- -- calls '_outputArray' with preset properties outputGraph :: [Graph] -> B.ByteString @@ -54,7 +56,7 @@ outputGraph gs = B.concat $ L.map (flipto3 _outputGraph "," "\n") (L.sort gs) -- -- * Second String is the between-dimensions-separator -- --- Example Output with \",\" and \"\\n\": +-- Example Output with @,@ and @\\n@: -- -- > Density: -- > 0.7619047619047619 diff --git a/src/DCB/Structures.hs b/src/DCB/Structures.hs index 2a229e3..316176c 100644 --- a/src/DCB/Structures.hs +++ b/src/DCB/Structures.hs @@ -1,6 +1,8 @@ -{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE OverlappingInstances #-} {-# LANGUAGE TypeSynonymInstances #-} + +-- | Data structures used for DCB computation. module DCB.Structures where import Data.Array.Repa as A hiding ((++)) @@ -19,7 +21,7 @@ type Attr = Matrix A.U Double -- | Adjacency-Matrix type Adj = Matrix A.U Int8 --- | Matrix storing the extent of a 'Graph'’s constraints fulfillment. +-- | 'Matrix' storing the extent of a 'Graph'’s constraints fulfillment. -- It stores the minimum (zeroth column) and maximum (first column) value of all -- the 'Graph'’s nodes per attribute. -- The 'Vector' stores values of @1@ if the bounds are within the allowed range diff --git a/src/Main.hs b/src/Main.hs index 5caaeab..12fd543 100644 --- a/src/Main.hs +++ b/src/Main.hs @@ -1,4 +1,4 @@ -{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE BangPatterns #-} {-# LANGUAGE CPP #-} {-# LANGUAGE DoAndIfThenElse #-} {-# LANGUAGE OverlappingInstances #-} @@ -14,8 +14,66 @@ -- Portability : -- -- | +-- Program to find densely connected biclusters (DCB) in an undirected graph. +-- DCB are highly connected subgraphs whose nodes share similar attributes. +-- +-- Each node of the source graph is linked with a table of attribute values +-- and for a certain number of attributes all nodes of a DCB must have +-- values within a specified range. Secondly, the density of the +-- subgraph must not exceed a threshold. -- ------------------------------------------------------------------------------ +-- /Using the program:/ +-- +-- >>> hgraph []... +-- +-- The program expects 3 mandatory arguments for an input graph of size /n/: +-- +-- (1) The plain text file that contains the adjacency matrix of the source +-- graph. The file must contain /n/ rows where each row consists of +-- exactly /n/ characters that are either @1@ if there exists an edge +-- between two nodes or @0@ otherwise. +-- The matrix must be symmetric. +-- +-- (2) The plain text file that contains the attributes linked to the nodes. +-- It must consist of /n/ rows. +-- Each row consists of /a/ floating point numbers separated by tab +-- characters @\\t@ where /a/ is the number of attributes. +-- The /b/-th value in the /v/-th row is the attribute value of the /b/-th +-- attribute linked to the /v/-th node. +-- +-- (3) The plain text file that contains the algorithm’s parameters. This file +-- contains 3 rows: +-- +-- * The first row solely contains the minimum density for a DCB as floating +-- point number +-- +-- * The second row solely contains the minimum number of matching +-- attributes for a DCB as an integer value. +-- +-- * The third row contains the tolerances for each attribute as floating +-- point values separated by tab characters @\\t@ where the /b/-th value +-- is the tolerated range for the DCB’s node’s values of the /b/-th +-- attribute. +-- +-- Rows are separated by newline characters @\\n@ and all floating point values +-- use dots as decimal separator. +-- +-- You may prepend the mandatory parameters with certain tokens: +-- +-- * @-h@ or @--help@ to display help (all following parameters are ignored) +-- +-- * @-t@ or @--time@ to measure and display computation time +-- +-- To enable multicore computation you need to append RTS options to the +-- program call as specified in the GHC manual. +-- +-- +-- +-- Running on 4 threads: +-- +-- >>> hgraph adjacency_matrix.adj attribute_matrix.attr properties.p +RTS -N4 +-- +---------------------------------------------------------------------------- module Main where @@ -50,18 +108,19 @@ import System.Environment import System.Exit (exitFailure, exitSuccess) --import Test.QuickCheck.All (quickCheckAll) - -data Params = Params { density :: Double - , matches :: Int - , range :: [Double] +-- | All secondary parameters that must be passed to the program packed into one data type. +data Params = Params { density :: Double -- ^ minimum density of a bicluster + , matches :: Int -- ^ required amount of matching attributes + , range :: [Double] -- ^ allowed divergence for each attribute } deriving (Show,Eq) instance NFData Params --- | Parses a row of the adjacency matrix of a graph. The consistancy of line lengths is not tested +-- | Parses a row of the adjacency matrix of a graph. The consistency of line lengths is not tested -- by this function! In case of a successfull parse a 'Left' ('Vector' a) is returned, otherwise a --- 'Right ByteString' containing an error message. +-- 'Right' 'ByteString' containing an error message. +-- -- > 10101 -- > 01010 -- > 00100 @@ -69,6 +128,7 @@ instance NFData Params -- > 10101 -- -- * Valid Values: @0@, @1@ +-- -- * any invalid value raises an error parseAdjMat :: (Num a, Unbox a) => ByteString -> Either (V.Vector a) ByteString parseAdjMat input = @@ -77,7 +137,7 @@ parseAdjMat input = result = V.unfoldrN size parseAdjMat' input in if size == V.length result then Left result - else Right $ B.append (B.pack "(adjecency)cannot parse ") input + else Right $ B.append (B.pack "(adjacency)cannot parse ") input where --parseAdjMat' :: ByteString -> Maybe (a, ByteString) parseAdjMat' input = @@ -85,26 +145,30 @@ parseAdjMat input = case c of '0' -> Just (0, B.tail input) '1' -> Just (1, B.tail input) - _ -> if isSpace c then parseAdjMat' (B.tail input) - else Nothing + _ -> Nothing --- | Tests if a parse result is considered valid. +-- | Tests if a parse result @(a, 'ByteString')@ is considered valid where @a@ is the parsed object +-- and the 'ByteString' is the remaining string that could not be parsed. testParse :: Maybe (a, ByteString) -> Maybe a testParse Nothing = Nothing testParse (Just (a, rem)) = if emptyLine rem then Just a else Nothing --- | Parses a row of the attribute matrix of a graph. The consistancy of line lengths is not tested --- by this function! In case of a successfull parse a 'Left [a]' is returned, otherwise a --- 'Right ByteString' containing an error message. +-- | Parses a row of the attribute matrix of a graph. The consistency of line lengths is not tested +-- by this function! In case of a successfull parse a @'Left' [a]@ is returned, otherwise a +-- @'Right' 'ByteString'@ containing the error message. +-- -- > 1 2.3 -- > -1 2.1 -- > 4 2.7 -- > 2.2 -3e-4 -- > 3 2.3 -- --- * Valid: Doubles divided by specified delimter +-- * Valid: 'Double's divided by specified delimter +-- -- * any invalid value raises an error -parseAttr :: Char -> ByteString -> Either [Double] ByteString +parseAttr :: Char -- ^ delimiter + -> ByteString -- ^ text to parse + -> Either [Double] ByteString parseAttr delim input = parseAttr' (B.split delim input) where parseAttr' :: [ByteString] -> Either [Double] ByteString parseAttr' (row:rem) = @@ -116,10 +180,18 @@ parseAttr delim input = parseAttr' (B.split delim input) parseAttr' [] = Left [] -- | Parses parameter file. --- First line: Density (Double) --- Second line: requied matches (Int) --- Third line is the tolerance for each attribute (Double values) -parseParams :: Char -> [ByteString] -> Either Params ByteString +-- +-- * First line: 'Density' +-- +-- * Second line: requied matches ('Int') +-- +-- * Third line is the tolerance for each attribute ('Double' values), see 'parseAttr' +-- +-- In case of an error during the parsing a @'Right' 'ByteString'@ containing the error +-- message is returned instead of a @'Left' 'Params'@ +parseParams :: Char -- ^delimiter + -> [ByteString] -- ^ text to parse + -> Either Params ByteString parseParams delim input | length input /= 3 = Right $ B.pack ("(params)amount of lines does not match (expected 3, got " ++ show (length input) ++ ")") @@ -134,7 +206,7 @@ parseParams delim input _ -> Right $ B.append (B.pack "(param - density)cannot parse ") (head input) --- | checks if a given Text is empty (Empty String, whitespaces) +-- | Checks if a given text is empty (empty string, whitespaces). emptyLine :: ByteString -> Bool emptyLine a | B.null a = True @@ -171,21 +243,21 @@ doCalculation adj attr p = doAll' gs a b c d e = gs ++ doAll' (step gs a b c d e) a b c d e --} --- | gets the length of the 'Left a'. +-- | Gets the length of the @'Left' [a]@. -- --- @0@ if Left a empty or no valid constructor. +-- @0@ if @[a]@ in @Left [a]@ is empty or it is a 'Right' value. getLength :: Either [a] b -> Int getLength (Left a) = length a getLength (Right _) = 0 --- | gets the length of the Left ('Vector a'). +-- | Gets the length of the @'Left' ('Vector' v)@. -- --- @0@ if Left a empty or no valid constructor. +-- @0@ if @a@ in 'Left a' is empty or it is a 'Right' value. getLengthV :: (Unbox a) => Either (V.Vector a) b -> Int getLengthV (Left a) = V.length a getLengthV (Right _) = 0 --- | prints the Help and exits +-- | Prints the help and exits. showHelp :: IO () showHelp = do putStrLn $ "Usage: hgraph []... \n"++ @@ -209,7 +281,7 @@ showHelp = do "\n" exitSuccess --- | checks if the submitted Text is empty. If not it will be printed out and the program aborts +-- | Checks if the submitted text is empty. If not it will be printed out and the program aborts. checkError :: ByteString -> IO () checkError a | emptyLine a = return () @@ -221,7 +293,7 @@ checkError a -- changed to return () to disable Debug. debug a = return () --putStrLn a --- | Removes one trailing carriage return character @\\r@ if existant. +-- | Removes one trailing carriage return character @\\r@ if existent. removeCarriageReturn :: ByteString -> ByteString removeCarriageReturn input = if B.last input == '\r' then B.init input @@ -237,11 +309,12 @@ removeCarriageReturn input = -- invalid tokens) that parameter and all following parameters are -- believed to be file paths. -- --- The returned tupel contains the existance of the /time/ token, --- existance of the /help/ token and all remaining file paths in that --- order. --- >>> processTokens ["--time", "-t", "foo.txt"] == (True, False, ["foo.txt"]) --- >>> processTokens ["-tx", "--help"] == (False, False, ["-tx", "--help"]) +-- The returned tupel contains the existance of the /time/ token, +-- existance of the /help/ token and all remaining file paths in that +-- order. +-- +-- >>> processTokens ["--time", "-t", "foo.txt"] == (True, False, ["foo.txt"]) +-- >>> processTokens ["-tx", "--help"] == (False, False, ["-tx", "--help"]) processTokens :: [String] -> (Bool, Bool, [FilePath]) processTokens [] = (False, False, []) -- single token with prefix "--" @@ -310,20 +383,20 @@ main = do if adjLines /= attrLines then checkError $ B.pack $ "Adjacency Matrix size "++ show adjLines ++ " differs from Attribute Matrix " ++ show attrLines ++ - ".\n" + "." else return () if adjLines /= adjNum then checkError $ B.pack $ "Adjacency Matrix is not square.\n" ++ "Read format is " ++ show adjLines ++ - "x" ++ show adjNum ++ ".\n" + "x" ++ show adjNum ++ "." else return () -- it is accaptable if the parameters file contains more attributes than the attribute matrix if attrParams < attrNum then checkError $ B.pack $ "Attribute Matrix format does not match Parameter.\n" ++ "Attribute Matrix has " ++ show attrNum ++ " attributes.\n" ++ - "Parameters implicate" ++ show attrParams ++ " attributes.\n" + "Parameters implicate " ++ show attrParams ++ " attributes." else return () ----- EXTRACT MATRICES diff --git a/src/Util.hs b/src/Util.hs index f40697a..f5dea9f 100644 --- a/src/Util.hs +++ b/src/Util.hs @@ -1,8 +1,8 @@ +-- | A collection of utility functions for working with functions, lists and strings. module Util where import Control.Parallel.Strategies import qualified Data.Map as Map -import qualified Data.Set as Set -- | Move first argument to first place (for style uniformity) flip1 :: (a -> b) -> (a -> b) @@ -83,6 +83,8 @@ infixl 1 +|| (+||) :: a -> Strategy a -> a a +|| b = a `using` b +-- | The function 'appendS' appends the string representation of the third element +-- to the second element followed by the first element as separator string. appendS :: (Show a) => String -> String -> a -> String appendS sep a b = (a ++ show b) ++ sep @@ -106,8 +108,8 @@ remDupsBy f (l:ls) = l:(remDups' l ls) -- When removing duplicates, the first function assigns the input to a bucket, -- the second function checks whether it is already in the bucket (linear search). --- | /O(n^2)/ Removes duplicate elements from a list. Performs better than --- 'Prelude.nub' by exploiting features of the 'Ord' class. +-- | /O(n^2)./ Removes duplicate elements from a list. Performs better than +-- 'Data.List.nub' by exploiting features of the 'Ord' class. ordNubBy :: (Ord b) => (a -> b) -> (a -> a -> Bool) -> [a] -> [a] ordNubBy p f l = go Map.empty l where @@ -124,14 +126,14 @@ ordNubBy p f l = go Map.empty l elem_by eq y (x:xs) = y `eq` x || elem_by eq y xs --- | Returns weather a string only contains whitespace or not. +-- | Returns whether a string only contains whitespace characters or not. isWhitespace :: String -> Bool isWhitespace "" = True isWhitespace (a:as) = (a `elem` " \r\n\t") && isWhitespace as -- | Tests whether an 'Either' type is 'Left'. isLeft :: Either a b -> Bool -isLeft a = case a of Left _ -> True; _ -> False +isLeft a = either (\e -> True) (\e -> False) a -- | Tests whether an 'Either' type is 'Right'. isRight :: Either a b -> Bool @@ -139,7 +141,7 @@ isRight = not . isLeft -- | The 'part' function takes a predicate and a list of tuples and returns -- the pair of lists of left elements which do and right elements which do not satisfy the --- predicate, respectively; i.e., +-- predicate, respectively; i. e., -- -- > part (\a b -> elem a b) [(1, [1, 3]), (2, [3, 4]), (0, [0, 5])] == ([1, 0], [[3, 4]]) part :: (a -> b -> Bool) -> [(a, b)] -> ([a], [b])