added ordnub-submodule with varying implementations

This commit is contained in:
Nicole Dresselhaus 2013-12-18 11:12:22 +01:00
parent fe7b575800
commit daf87fd737
7 changed files with 4057 additions and 0 deletions

7
submodules/haskell-ordnub/.gitignore vendored Normal file
View File

@ -0,0 +1,7 @@
dist
cabal-dev
*.o
*.hi
*.chi
*.chs.h
.virthualenv

View File

@ -0,0 +1,622 @@
accelerate-examples
acid-state
AC-MiniTest
activehs
adaptive-containers
adict
AERN-Net
aeson-schema
afv
Agata
Agda
air
ajhc
alex
alex-meta
alloy
alms
antfarm
archlinux
archlinux-web
arithmoi
atom
atomo
augur
autoproc
AvlTree
barchart
barley
base
BASIC
BerlekampAlgorithm
berp
bff
bidirectionalization-combined
billeksah-forms
billeksah-pane
billeksah-services
bimap
bindings-bfd
bio
Biobase
BiobaseTrainingData
BiobaseXNA
bitmap
bitmaps
bitset
Blobs
BNFC
BNFC-meta
boomerang
buildwrapper
c0parser
Cabal
cabal2doap
cabal2ghci
cabal2nix
cabal-debian
cabal-install
cabal-install-bundle
cabal-install-ghc72
cabal-install-ghc74
cabal-rpm
cabal-test
cairo
caldims
cap
cassy
cblrepo
cedict
cgen
cgi-utils
chalkboard
chalmers-lava2000
Chart
chp
chp-plus
chp-spec
ChristmasTree
chunks
circ
citeproc-hs
classify
classy-prelude
CMCompare
cmdargs
cmdlib
Coadjute
collada-output
combinatorial-problems
computational-algebra
constructive-algebra
containers
copilot-sbv
CPBrainfuck
CPL
cpsa
cqrs-example
Craft3e
creatur
crf-chain1
cryptocipher
csound-expression
curry-frontend
cyclotomic
DAG-Tournament
darcs
darcs-benchmark
darcs-beta
data-accessor-template
database-study
data-lens-template
data-ordlist
data-store
dclabel-eci11
ddc-core
ddc-core-llvm
debian
deepseq-th
DefendTheKing
delaunay
denominate
derive
derive-gadt
derive-trie
diagrams-builder
disjoint-set
DisTract
djinn
djinn-th
doc-review
doctest
dot2graphml
download-media-content
DPM
DrHylo
DrIFT-cabalized
DSH
dvda
dynamic-linker-template
Ebnf2ps
Elm
Emping
Encode
enumerable
enumerator
EnumMap
enummapset-th
Etage-Graph
extcore
factory
fault-tree
fay
fay-base
fec
feed
feldspar-compiler
FerryCore
fficxx
fgl
fibon
FileManip
filemanip
filestore
fixplate
flite
FM-SBLEX
foo
formal
forml
fquery
frag
freenect
free-theorems
free-theorems-counterexamples
free-theorems-seq
free-theorems-webui
fsmActions
fst
ftshell
Fungi
funsat
GA
game-probability
gbu
gconf
gdiff-th
GenI
geni-gui
genprog
gf
ghci-haskeline
ghci-ng
ghc-make
ghc-mod
GHood
ginsu
gio
git-annex
gitit
glade
glib
gmap
gnomevfs
gofer-prelude
GoogleSB
gpah
GPipe
Graphalyze
graph-rewriting
GraphSCC
graphviz
Grempa
grid
grm
groundhog-th
gruff-examples
gstreamer
gtk
gtk2hs-buildtools
gtkglext
gtkimageview
gtk-mac-integration
gtksourceview2
gt-tools
gulcii
hacanon-light
HackMail
hackport
haddock
haddock-leksah
HaLeX
halfs
hans
happindicator
HAppS-State
happstack-state
happy
happy-meta
HaRe
hark
hashable
hashed-storage
hashring
haskdogs
haskell2010
haskell98
haskelldb
HaskellForMaths
haskell-in-space
haskell-names
Haskelloids
haskell-src-meta
haskell-type-exts
haskell-updater
hasktags
hat
HaVSA
hawitter
HaXml
Hayoo
hbayes
hbro
hbro-contrib
hcube
hdf
heaps
hermit
hfusion
hgettext
hgom
HHDL
hierarchical-clustering
hiernotify
highlighting-kate
himerge
hinduce-missingh
hint
hipe
hjson-query
hlcm
hledger
hledger-lib
hledger-web
hlint
hly
hmatrix-glpk
hmt
Holumbus-Searchengine
homeomorphic
hoogle
hopfield
hpaste
HPath
hs2bf
Hs2lib
hsc3
hsc3-auditor
hsc3-graphs
HSFFIG
hsgsom
HSH
HsParrot
hsqml-morris
hssourceinfo
hstyle
hstzaar
HTab
hTensor
http-conduit
htzaar
hws
hx
HXQ
hxt
hxt-relaxng
hxt-xslt
hyahtzee
hylotab
ideas
ideas-math
idris
ige-mac-integration
igraph
imagepaste
imbib
implicit
improve
INblobs
incremental-parser
inflist
instant-generics
intset
iproute
iptables-helpers
ircbot
iterIO
java-reflect
jmacro
json2
json-b
JSONb
JYU-Utils
kansas-lava
katt
kevin
kibro
KiCS
KiCS-debugger
kit
kmeans-vector
kure
lambdabot
LambdaHack
language-boogie
language-c
language-ecmascript
language-haskell-extract
lazy-csv
ldif
leksah
leksah-server
lens
lenses
Level0
lhae
lhc
lhs2TeX-hl
libcspm
liboleg
LinearSplit
ListLike
listlike-instances
llsd
llvm-base
llvm-general
lostcities
LRU
L-seed
LslPlus
ltk
Lucu
MagicHaskeller
makedo
make-hard-links
manatee-core
markov
mathgenealogy
matsuri
MazesOfMonad
metamorphic
MetaObject
meta-par
MFlow
Mhailist
mines
minesweeper
MissingH
ml-w
module-management
monadiccp
Monadius
MonadLab
Monaris
monitor
mprover
mps
mucipher
multiset
multiset-comb
Munkres
murder
music-diatonic
music-score
narc
nemesis
netcore
nettle-frp
nettle-netkit
network-dns
newtype-th
n-m
noise
NoSlow
notmuch-haskell
nsis
numeric-prelude
NXT
obj
omega
openid
Operads
optimusprime
packed-dawg
packunused
PageIO
pango
paragon
parconc-examples
parsec1
parsec2
parsec3
parsec
parsec-permutation
parsers
parsimony
pec
pesca
pisigma
plugins
plugins-auto
pointfree
polyseq
poppler
prefix-units
prelude-generalize
prelude-plus
presburger
probability
prolog
prolog-graph-lib
PropLogic
Pugs
pugs-compat
pugs-DrIFT
pushme
QuickCheck
quickcheck-poly
quickcheck-script
quickspec
rabocsv2qif
ralist
Rasenschach
rdf4h
reaction-logic
reactive-banana
regexchar
regex-deriv
regex-dfa
regexdot
regex-parsec
regex-pderiv
regex-tdfa
remote
RepLib
RESTng
rezoom
riot
roguestar-engine
rpc-framework
rsagl-frp
ruler-core
safecopy
sat-micro-hs
SBench
sbv
scc
scenegraph
scion
scion-browser
scope
scyther-proof
Semantique
SG
sgf
shake
shake-extras
shaker
shell-escape
shelltestrunner
shuffle
Shu-thing
sifflet-lib
simgi
simple-css
SimpleEA
sirkel
snap-loader-dynamic
snap-predicates
snm
Snusmumrik
SoccerFun
SourceGraph
sourcemap
specialize-th
sphinx
squeeze
standalone-haddock
statechart
static-resources
Strafunski-Sdf2Haskell
stream-fusion
structural-induction
stylish-haskell
supero
svgcairo
svgutils
svm-simple
swish
syb
syb-with-class
sym
SyntaxMacros
tableaux
tagging
tamarin-prover-theory
TCache
teams
template-default
tempus
term-rewriting
TernaryTrees
test-framework-th-prime
testloop
textmatetags
theoremquest
thih
th-instances
TigerHash
timberc
time-recurrence
timestamp-subprocess-lines
todos
toolshed
Top
topkata
toysolver
tpdb
translatable-intset
traverse-with-class
triangulation
TrieMap
trifecta
turni
TypeIlluminator
type-sub-th
uhc-util
UMM
unbound
unboxed-containers
universe-th
unscramble
urlcheck
utility-ht
uuagc
uuagc-bootstrap
uuagc-cabal
uuid
uvector
vacuum-ubigraph
vector-clock
visual-graphrewrite
vte
wai-handler-devel
webkit
wikipedia4epub
Wired
WordNet
WordNet-ghc74
wxc
wxcore
Xec
xfconf
xhaskell-library
xml2x
xmlhtml
xmonad
xmonad-bluetilebranch
xmonad-contrib
xmonad-contrib-bluetilebranch
xmonad-extras
Yablog
ycextra
yesod-comments
yesod-core
yhccore
yi
yices-easy
yst
yuiGrid
yuuko
zeroth
zip-archive

View File

@ -0,0 +1,31 @@
ordnub
======
Data.List.nub is O(n²). This one is O(n log n) by requiring an Ord instance.
* Also contains a benchmark (`report.html`) that shows that `ordNub` apparently is faster than `nub` in *all* cases.
* `PACKAGES_USING_NUB.txt` contains all packages which use `nub` (made with a quick grep).
It's not the most accurate since some packages define their own `nub`, but that's a minority.
**This thing here is not a library.** It is a benchmark suite. [View results here](http://htmlpreview.github.io/?https://github.com/nh2/haskell-ordnub/blob/master/report.html).
Don't use nub
-------------
If you are looking for a fast `ordNub` function to use in your code, you can use:
```haskell
import qualified Data.Set as Set
ordNub :: (Ord a) => [a] -> [a]
ordNub l = go Set.empty l
where
go _ [] = []
go s (x:xs) = if x `Set.member` s then go s xs
else x : go (Set.insert x s) xs
```
### Other Data.List functions you NEVER want to use
`\\`, `union`, `intersect`

View File

@ -0,0 +1,2 @@
import Distribution.Simple
main = defaultMain

View File

@ -0,0 +1,17 @@
name: ordnub
version: 0.1.0.0
synopsis: Faster nub (remove duplicates)
description: Data.List.nub is O(n²). This one is O(n log n) by requiring an Ord instance.
license: MIT
license-file: LICENSE
author: Niklas Hambuechen
maintainer: Niklas Hambuechen <mail@nh2.me>
category: Data
build-type: Simple
cabal-version: >=1.10
executable ordnub
main-is: ordnub.hs
build-depends: base, containers, mtl, criterion, QuickCheck
default-language: Haskell98
ghc-options: -Wall -O2

View File

@ -0,0 +1,226 @@
{-# LANGUAGE ScopedTypeVariables #-}
module Main where
import Control.Monad.State.Strict
import qualified Control.Monad.State.Lazy as SL
import Data.Function (on)
import Data.List (nub, nubBy)
import qualified Data.Map as Map
import qualified Data.Set as Set
import Test.QuickCheck
import Test.QuickCheck.Function
import Criterion.Main
-- Just copied from Data.List
localNub :: (Eq a) => [a] -> [a]
localNub l = nub' l []
where
nub' [] _ = []
nub' (x:xs) ls
| x `elem` ls = nub' xs ls
| otherwise = x : nub' xs (x:ls)
-- Taken From Yi
ordNub :: (Ord a) => [a] -> [a]
ordNub l = go Set.empty l
where
go _ [] = []
go s (x:xs) = if x `Set.member` s then go s xs
else x : go (Set.insert x s) xs
-- Using a state monad
ordNubState :: (Ord a) => [a] -> [a]
ordNubState xs = evalState (filterM f xs) Set.empty
where
f x = do set <- get
if Set.member x set
then return False
else put (Set.insert x set) >> return True
-- Using a lazy state monad
ordNubStateLazy :: (Ord a) => [a] -> [a]
ordNubStateLazy xs = SL.evalState (filterM f xs) Set.empty
where
f x = do set <- SL.get
if Set.member x set
then return False
else SL.put (Set.insert x set) >> return True
-- Using a state monad with a dlist instead of filterM
ordNubStateDlist :: (Ord a) => [a] -> [a]
ordNubStateDlist l = evalState (f l id) Set.empty
where
f [] dlist = return $ dlist []
f (x:xs) dlist = do set <- get
if Set.member x set
then f xs dlist
else put (Set.insert x set) >> f xs (dlist . (x:))
-- Using a lazy state monad with a dlist instead of filterM
ordNubStateLazyDlist :: (Ord a) => [a] -> [a]
ordNubStateLazyDlist l = SL.evalState (f l id) Set.empty
where
f [] dlist = return $ dlist []
f (x:xs) dlist = do set <- SL.get
if Set.member x set
then f xs dlist
else SL.put (Set.insert x set) >> f xs (dlist . (x:))
-- When removing duplicates, the first function assigns the input to a bucket,
-- the second function checks whether it is already in the bucket (linear search).
ordNubBy :: (Ord b) => (a -> b) -> (a -> a -> Bool) -> [a] -> [a]
ordNubBy p f l = go Map.empty l
where
go _ [] = []
go m (x:xs) = let b = p x in case b `Map.lookup` m of
Nothing -> x : go (Map.insert b [x] m) xs
Just bucket
| elem_by f x bucket -> go m xs
| otherwise -> x : go (Map.insert b (x:bucket) m) xs
-- From the Data.List source code.
elem_by :: (a -> a -> Bool) -> a -> [a] -> Bool
elem_by _ _ [] = False
elem_by eq y (x:xs) = y `eq` x || elem_by eq y xs
main :: IO ()
main = defaultMain
[ bgroup "simple"
[ bench "nub [1]" $ nf nub [1::Int]
, bench "nub [1..10]" $ nf nub [1..10::Int]
, bench "nub [1..100]" $ nf nub [1..100::Int]
, bench "nub [1..1000]" $ nf nub [1..1000::Int]
, bench "nub (replicate 1000 1)" $ nf nub (replicate 1000 (1::Int))
, bench "ordNub [1]" $ nf ordNub [1::Int]
, bench "ordNub [1..10]" $ nf ordNub [1..10::Int]
, bench "ordNub [1..100]" $ nf ordNub [1..100::Int]
, bench "ordNub [1..1000]" $ nf ordNub [1..1000::Int]
, bench "ordNub (replicate 1000 1)" $ nf ordNub (replicate 1000 (1::Int))
]
, bgroup ""
[ bench "benchmarks:" $ nf id 'x' -- just so that I can comment out easily
-- , bench "1000 nub" $ nf nub l1000
-- , bench "500 nub" $ nf nub l500
, bench "100 nub" $ nf nub l100
, bench "50 nub" $ nf nub l50
, bench "10 nub" $ nf nub l10
, bench "5 nub" $ nf nub l5
, bench "1 nub" $ nf nub l1
-- , bench "1000 localNub" $ nf localNub l1000
-- , bench "500 localNub" $ nf localNub l500
, bench "100 localNub" $ nf localNub l100
, bench "50 localNub" $ nf localNub l50
, bench "10 localNub" $ nf localNub l10
, bench "5 localNub" $ nf localNub l5
, bench "1 localNub" $ nf localNub l1
-- -- , bench "1000 ordNub" $ nf ordNub l1000
-- -- , bench "500 ordNub" $ nf ordNub l500
, bench "100 ordNub" $ nf ordNub l100
, bench "50 ordNub" $ nf ordNub l50
, bench "10 ordNub" $ nf ordNub l10
, bench "5 ordNub" $ nf ordNub l5
, bench "1 ordNub" $ nf ordNub l1
-- -- , bench "1000 ordNubState" $ nf ordNubState l1000
-- -- , bench "500 ordNubState" $ nf ordNubState l500
, bench "100 ordNubState" $ nf ordNubState l100
, bench "50 ordNubState" $ nf ordNubState l50
, bench "10 ordNubState" $ nf ordNubState l10
, bench "5 ordNubState" $ nf ordNubState l5
, bench "1 ordNubState" $ nf ordNubState l1
-- , bench "1000 ordNubStateLazy" $ nf ordNubStateLazy l1000
-- , bench "500 ordNubStateLazy" $ nf ordNubStateLazy l500
, bench "100 ordNubStateLazy" $ nf ordNubStateLazy l100
, bench "50 ordNubStateLazy" $ nf ordNubStateLazy l50
, bench "10 ordNubStateLazy" $ nf ordNubStateLazy l10
, bench "5 ordNubStateLazy" $ nf ordNubStateLazy l5
, bench "1 ordNubStateLazy" $ nf ordNubStateLazy l1
-- , bench "1000 ordNubStateDlist" $ nf ordNubStateDlist l1000
-- , bench "500 ordNubStateDlist" $ nf ordNubStateDlist l500
, bench "100 ordNubStateDlist" $ nf ordNubStateDlist l100
, bench "50 ordNubStateDlist" $ nf ordNubStateDlist l50
, bench "10 ordNubStateDlist" $ nf ordNubStateDlist l10
, bench "5 ordNubStateDlist" $ nf ordNubStateDlist l5
, bench "1 ordNubStateDlist" $ nf ordNubStateDlist l1
-- , bench "1000 ordNubStateLazyDlist" $ nf ordNubStateLazyDlist l1000
-- , bench "500 ordNubStateLazyDlist" $ nf ordNubStateLazyDlist l500
, bench "100 ordNubStateLazyDlist" $ nf ordNubStateLazyDlist l100
, bench "50 ordNubStateLazyDlist" $ nf ordNubStateLazyDlist l50
, bench "10 ordNubStateLazyDlist" $ nf ordNubStateLazyDlist l10
, bench "5 ordNubStateLazyDlist" $ nf ordNubStateLazyDlist l5
, bench "1 ordNubStateLazyDlist" $ nf ordNubStateLazyDlist l1
-- `by` functions
-- , bench "1000 nubBy" $ nf (nubBy (\a b -> a `quot` 2 == b `quot` 2) (==)) l1000
-- , bench "500 nubBy" $ nf (nubBy (\a b -> a `quot` 2 == b `quot` 2) (==)) l500
, bench "100 nubBy" $ nf (nubBy (\a b -> a `quot` 2 == b `quot` 2)) l100
, bench "50 nubBy" $ nf (nubBy (\a b -> a `quot` 2 == b `quot` 2)) l50
, bench "10 nubBy" $ nf (nubBy (\a b -> a `quot` 2 == b `quot` 2)) l10
, bench "5 nubBy" $ nf (nubBy (\a b -> a `quot` 2 == b `quot` 2)) l5
, bench "1 nubBy" $ nf (nubBy (\a b -> a `quot` 2 == b `quot` 2)) l1
-- , bench "1000 ordNubBy" $ nf (ordNubBy (`quot` 2) (==)) l1000
-- , bench "500 ordNubBy" $ nf (ordNubBy (`quot` 2) (==)) l500
, bench "100 ordNubBy" $ nf (ordNubBy (`quot` 2) (==)) l100
, bench "50 ordNubBy" $ nf (ordNubBy (`quot` 2) (==)) l50
, bench "10 ordNubBy" $ nf (ordNubBy (`quot` 2) (==)) l10
, bench "5 ordNubBy" $ nf (ordNubBy (`quot` 2) (==)) l5
, bench "1 ordNubBy" $ nf (ordNubBy (`quot` 2) (==)) l1
]
-- Other benchmarks, and what people contributed
, bgroup "other"
[ bench "nub yitz 1" $ nf nub (2 : replicate 100000 1 ++ [3] :: [Int])
, bench "ordNub yitz 1" $ nf ordNub (2 : replicate 100000 1 ++ [3] :: [Int])
, bench "nub yitz 2" $ nf nub ([3,2,1] ++ take 100000 (cycle [3,2,1]) ++ [4] :: [Int])
, bench "ordNub yitz 2" $ nf ordNub ([3,2,1] ++ take 100000 (cycle [3,2,1]) ++ [4] :: [Int])
]
]
where
-- l1000 = concat $ replicbate 10 [1..1000::Int]
-- l500 = concat $ replicate 20 [1..500::Int]
l100 = concat $ replicate 100 [1..100::Int]
l50 = concat $ replicate 200 [1..50::Int]
l10 = concat $ replicate 1000 [1..10::Int]
l5 = concat $ replicate 2000 [1..5::Int]
l1 = concat $ replicate 10000 [1::Int]
tests :: IO ()
tests = mapM_ (quickCheckWith stdArgs{ maxSuccess = 1000, maxSize = 200 })
[ isLikeNub localNub
, isLikeNub ordNub
, isLikeNub ordNubState
, isLikeNub ordNubStateDlist
-- ordNubBy tests
, property $ \(l :: [(Int, Int)]) -> ordNubBy fst ((>) `on` snd) l
== nubBy (\(a,b) (x,y) -> a == x && b > y) l
, property $ \(l :: [(Int, Int)], Fun _ f :: Fun Int (Fun Int Bool)) ->
let fun x y = f x `apply` y
in ordNubBy fst (\(_, b) (_, y) -> b `fun` y) l ==
nubBy (\(a,b) (x,y) -> a == x && b `fun` y) l
]
where
isLikeNub f = property (\l -> nub l == f (l :: [Int]))

File diff suppressed because one or more lines are too long