WITH x(v) AS (VALUES (1), (2), (NULL))
SELECT l.v, r.v,
       l.v = r.v AS equality,
       l.v IS NOT DISTINCT FROM r.v AS isnotdistinctfrom
FROM x l, x r;
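For reference, here is the result (psql output; row order may vary with the plan): = returns NULL whenever either operand is NULL, while IS NOT DISTINCT FROM always returns true or false, treating two NULLs as equal.

 v | v | equality | isnotdistinctfrom
---+---+----------+-------------------
 1 | 1 | t        | t
 1 | 2 | f        | f
 1 |   |          | f
 2 | 1 | f        | f
 2 | 2 | t        | t
 2 |   |          | f
   | 1 |          | f
   | 2 |          | f
   |   |          | t
(9 rows)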
Thanks to Sam Mason via the pgsql-general mailing list.
## box-and-whisker plot of FFT power spectrum by frequency/period

## vary these parameters -- r as per logistic map
## see http://en.wikipedia.org/wiki/Logistic_map for details
logmap = function(n, r, x0=0.5) {
    ret = rep(x0, n)
    for (ii in 2:n) {
        ret[ii] = r*ret[ii-1]*(1-ret[ii-1])
    }
    return(ret)
}

## modify these for interesting behavior differences
rfrom = 3.4
rto = 3.7
rsteps = 200
seqlen = 1e4

require(plyr)
require(ggplot2)

mytimeseries = aaply(seq(from=rfrom, to=rto, length.out=rsteps), 1,
                     function(x) { logmap(seqlen, x) })

## you can plug in any array here for mytimeseries
## each row is a timeseries
## for series by column, change the margin from 1 to 2, below
logspec = adply(mytimeseries, 1, function(x) {
    ## change spec.pgram parameters as per goals
    ret = spectrum(x, taper=0, demean=T, pad=0, fast=T, plot=F)
    return(data.frame(freq=ret$freq, spec=ret$spec))
})

## boxplot stats at each unique frequency
logspec.bw = ddply(logspec, 'freq', function(x) {
    ret = boxplot.stats(log10(x$spec))
    ## sometimes $out is empty, bind NA to keep dimensions correct
    return(data.frame(t(10^ret$stats), out=c(NA, 10^ret$out)))
})

## plot -- outliers are dots
## be careful with spacing of hard returns -- ggplot has odd rules
## as close to default "spectrum" plot as possible
ggplot(logspec.bw, aes(x=1/freq)) +
    geom_ribbon(aes(ymin=X1, ymax=X5), alpha=0.35, fill='green') +
    geom_ribbon(aes(ymin=X2, ymax=X4), alpha=0.5, fill='blue') +
    geom_line(aes(y=X3)) +
    geom_point(aes(y=out), color='darkred') +
    scale_x_continuous(trans='inverse', name='Period') +
    scale_y_continuous(name='Power', trans='log10')
require(animation)
saveVideo({
    ani.options(interval = 1/5, ani.type='png', ani.width=1600, ani.height=1200)
    for (ii in c(seq(from=3.2, to=3.5, length.out=200),
                 seq(from=3.5, to=4, length.out=1200))) {
        spectrum(logmap(1e4, ii), taper=0, demean=T, pad=0, fast=T,
                 sub=round(ii, 3))
    }
}, video.name = "logmap2.mp4", other.opts = "-b 600k")
## using R --
## logistic map function:
logmap = function(n, r, x0=0.5) {
    ret = rep(x0, n)
    for (ii in 2:n) {
        ret[ii] = r*ret[ii-1]*(1-ret[ii-1])
    }
    return(ret)
}

## compare:
spectrum(logmap(1e5, 3.5), main='R tapers to 0.1 by default')
spectrum(logmap(1e5, 3.5), taper=0, main='No taper')
spectrum(logmap(1e5+1, 3.5), taper=0, main='Non-composite length, fast=T pads by default')
spectrum(logmap(1e5, 3.5), taper=0.5, sub='Lots of padding approximately recovers "correct" results')
## use aaply -- the simplest code
require(plyr)
testfun0 <- function(x, y) aaply(x, 1, function(x) rbinom(1, x, y))

## rewrite the above as an explicit loop
testfun1 = function(nrep, x, y) {
    ## loop index
    ii <- 0
    ## result vector
    ret <- rep(0, nrep)
    while (ii < nrep) {
        ii <- ii + 1
        ## how many successes for each element of bb?
        ret[ii] <- rbinom(1, x[ii], y)
    }
    return(ret)
}
## define source code as string, pass to inline
src <- '
    // Cast input parameters to correct types
    // clone prevents direct modification of the input object
    IntegerVector tmp(clone(x));
    double rate = as< double >(y);
    // IntegerVector inherits from STL vector, so we get standard STL methods
    int tmpsize = tmp.size();
    // initialize RNG from R set.seed() call
    RNGScope scope;
    // loop
    for (int ii = 0; ii < tmpsize; ii++) {
        // Rf_rbinom is in the R API
        // For more details, see Rcpp-quickref.pdf, "Random Functions"
        // also, Rcpp provides "safe" accessors via, e.g., tmp(ii)
        tmp(ii) = Rf_rbinom(tmp(ii), rate);
    };
    // tmp points to a native R object, so we can return it as-is
    // if we wanted to return, e.g., the double rate, use:
    // return wrap(rate);
    return tmp;
'
require(inline)
## compile the function, inspect the process with verbose=T
testfun2 = cxxfunction(signature(x='integer', y='numeric'), src,
                       plugin='Rcpp', verbose=T)

## timing function
ps <- function(x) print(system.time(x))
## Input vector
bb <- rbinom(1e5, 20, 0.5)

## test each case for 2 different loop lengths
for (nrep in c(1e4, 1e5)) {
    ## initialize RNG each time with the same seed
    ## plyr
    set.seed(20); ps(cc0 <- testfun0(bb[1:nrep], 0.1))
    ## explicit while loop
    set.seed(20); ps(cc1 <- testfun1(nrep, bb, 0.1))
    ## Rcpp
    set.seed(20); ps(cc2 <- testfun2(bb[1:nrep], 0.1))
    print(all.equal(cc1, cc2))
}

## output
   user  system elapsed
  3.392   0.020   3.417
   user  system elapsed
  0.116   0.000   0.119
   user  system elapsed
  0.000   0.000   0.002
[1] TRUE
   user  system elapsed
 37.534   0.064  37.601
   user  system elapsed
  1.228   0.000   1.230
   user  system elapsed
  0.020   0.000   0.021
[1] TRUE
## Using Rcpp, inline and STL algorithms and containers
## use the std::binary_function templated struct
inc <- '
    struct Rb : std::binary_function< double, double, double > {
        double operator() (double size, double prob) const {
            return ::Rf_rbinom(size, prob);
        }
    };
'

## define source code as a character vector, pass to inline
src <- '
    NumericVector sz(x);
    RNGScope scope;
    return NumericVector::import_transform(sz.begin(), sz.end(),
        std::bind2nd(Rb(), as< double >(y)));
'

## compile the function, inspect the process with verbose=TRUE
f5 <- cxxfunction(signature(x='numeric', y='numeric'), src, 'Rcpp', inc,
                  verbose=TRUE)
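A quick sanity check (my sketch, reusing bb and testfun2 from above): both implementations call Rf_rbinom once per element in the same order, so given the same seed they should return identical draws.

## sanity check (sketch): f5 should match testfun2 for the same seed
set.seed(20); v1 <- testfun2(bb[1:100], 0.1)
set.seed(20); v2 <- f5(as.numeric(bb[1:100]), 0.1)
all.equal(as.numeric(v1), as.numeric(v2))  ## expect TRUE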
myhist = function(x, dig=3) {
    x = trunc(x, digits=dig)
    ## x = round(x, digits=dig)
    aa = bb = seq(0, 1, 1/10^dig)
    for (ii in 1:length(aa)) {
        aa[ii] = sum(x == aa[ii])
    }
    return(cbind(bin=bb, dens=aa/length(x)))
}

## random variates
test = sort(runif(1e4))
get1 = myhist(test)
## their method
require(ash)
get2 = bin1(test, c(0,1), 1e3+1)$nc
mymatrix = matrix(1:10, nrow=10)
apply(mymatrix, 1, sum)
apply(mymatrix, 2, sum)
rowSums(mymatrix)
colSums(mymatrix)
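rowSums and colSums run in compiled code and skip the per-row R function calls that apply makes, so they are usually far faster on large matrices. A rough timing sketch (numbers will vary by machine):

## timing sketch -- rowSums vs. apply on a larger matrix
mm = matrix(rnorm(1e6), nrow=1e3)
system.time(apply(mm, 1, sum))
system.time(rowSums(mm))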
## adply needs plyr; melt needs reshape2 (or the older reshape)
require(plyr)
require(reshape2)
myarray = array(0, dim=c(3,4,5,6,10),
                dimnames=list(a=1:3, b=1:4, c=1:5, d=letters[1:6], e=LETTERS[1:10]))
melt(adply(myarray, 1:4, I))
An unexpected bonus: this way of thinking lends itself nicely to explicit parallelism. If you can describe the problem as a single recipe applied to each "piece" of a whole, then you're one step away from using all the cores on your machine via foreach. Need to apply a long-running analysis to each element of a list, and want the results back as a list? Why write a for loop when you can do:
## prep
## warning -- this may not be fast
## use smaller matrices for smaller machines!
require(foreach)
require(iterators)
nn = 2^20
rr = 2^10
mylist = list(a=matrix(rnorm(nn), nrow=rr),
              b=matrix(rnorm(nn), nrow=rr),
              c=matrix(rnorm(nn), nrow=rr))

## analysis
myeigs = foreach(ii = iter(mylist), .combine=c) %do% {
    print('ping')
    return(list(eigen(ii)))
}
require(multicore)
require(doMC)
## use the appropriate # of cores, of course
registerDoMC(cores=4)
myeigs1 = foreach(ii = iter(mylist), .combine=c) %dopar% {
    print('ping')
    return(list(eigen(ii)))
}
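To see the payoff, time the two variants; with several cores registered, the %dopar% run should take a fraction of the sequential wall-clock time (a rough sketch -- results depend on your hardware and memory bandwidth):

## timing sketch -- sequential vs. parallel
ps <- function(x) print(system.time(x))
ps(foreach(ii = iter(mylist), .combine=c) %do% list(eigen(ii)))
ps(foreach(ii = iter(mylist), .combine=c) %dopar% list(eigen(ii)))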
Error: cannot allocate vector of size 818.6 Mb

Enter a frame number, or 0 to exit
...
4: mk.bootrep(zz, 75)
5: mk.boot.R#165: aaply(tmpreps, .margins = 1, function(x) {
6: laply(.data = pieces, .fun = .fun, ..., .progress = .progress, .drop = .dro
7: list_to_array(res, attr(.data, "split_labels"), .drop)
8: unname(unlist(res))
dimcurse = function(dims=1:1e3, .sd=c(1e-2, 0.1, .2, .5, 1), reps=1e3, cores=4,
                    rfun=rnorm, .methods=c('manhattan', 'euclidean', 'maximum'),
                    .verbose=F) {
    ## examine distance between 2 points in dims dimensions for different metrics
    ## points chosen at random according to rfun (rnorm by default)
    require(foreach)
    require(doMC)
    registerDoMC(cores=cores)
    ## use multicore/foreach on the outside (each sd)
    finret = foreach(ss=iter(.sd), .combine=rbind, .verbose=.verbose) %dopar% {
        ## pre-allocate, don't append/rbind!
        ret = data.frame(.sd=1:(reps*length(dims)*length(.methods)), .dim=1,
                         .rep=1, .dist=1, method=factor(NA, levels=.methods))
        ii = 1
        for (.dim in dims) {
            for (method in .methods) {
                ## reset .rep counter
                .rep = 1
                while (.rep <= reps) {
                    ## 2 points, one per row, randomly chosen
                    tmp = matrix(rfun(2*.dim, sd=ss), nrow=2)
                    ## calc dist
                    mydist = as.numeric(dist(tmp, method=method))
                    ## fill return dataframe
                    ret[ii, ] = list(ss, .dim, .rep, mydist, method)
                    ## update rep counter
                    .rep = .rep + 1
                    ## update ret row counter
                    ii = ii + 1
                }
            }
        }
        return(ret)
    }
    ## rbind happens here
    return(finret)
}
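The aa data frame in the plots below is just the function's return value; something like this (a sketch with a reduced problem size so it runs quickly -- the full defaults take a while):

## generate the plotting data -- smaller dims/reps for a quick run
aa = dimcurse(dims=seq(1, 1e3, by=10), reps=100, cores=4)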
For plotting, I've used:

require(lattice)
xyplot(.dist ~ .dim | method, groups=.sd, data=aa, pch='.',
       par.settings=list(background=list(col='darkgrey')),
       layout=c(3,1),
       scales=list(y=list(relation='free'), alternating=1),
       auto.key=list(space='bottom', columns=5, points=F, lines=T),
       xlab='Dimension', ylab='Distance',
       main='Distance between 2 N-dimensional points \n drawn from a normal distribution with varying sd (legend) using different metrics (columns)')

xyplot(.dist ~ .dim | as.character(.sd), groups=method, data=aa, pch='.',
       par.settings=list(background=list(col='darkgrey')),
       layout=c(5,1),
       scales=list(y=list(relation='free'), alternating=1),
       auto.key=list(space='bottom', columns=3, points=F, lines=T),
       xlab='Dimension', ylab='Distance',
       main='Distance between 2 N-dimensional points \n drawn from a normal distribution with varying SD (columns) using different metrics (legend)')
$ ssh -f -R 10187:localhost:10187 remotehost sleep 600  ## 10 minutes to set up connection

> require(snow); require(foreach); require(doSNOW);
> cl = makeCluster(rep('localhost',3), type='SOCK', manual=T)  ## wait for 3 slaves

## Use ssh to start the slaves on remotehost and connect to the (local) forwarded port
$ for i in {1..3};
$ do ssh -f remotehost "/usr/lib64/R/bin/Rscript ~/R/x86_64-pc-linux-gnu-library/2.12/snow/RSOCKnode.R
$ MASTER=localhost PORT=10187 OUT=/dev/null SNOWLIB=~/R/x86_64-pc-linux-gnu-library/2.12";
$ done

## R should have picked up the slaves and returned. If not, something is wrong.
## Back up and test ssh connectivity, etc.
> registerDoSNOW(cl)
> a <- matrix(1:4e6, nrow=4)
> b <- t(a)
> bb <- foreach(b=iter(b, by='col'), .combine=cbind) %dopar%
+     (a %*% b)
-- it's nice to have a sandbox
-- remember to reset search_path when done
create schema transtest;
set search_path to transtest, public;

-- the entries table
create table myents ( id serial primary key, sum integer);

-- the reports table, many reports per entry
create table myrecs ( id serial primary key,
                      entid integer references myents (id),
                      cases integer);

-- table used to store trigger state
create table myids ( id integer primary key references myents (id));
-- function for when-clause (condition) in trigger
-- add id to the "to-do" list if it's not there already
-- return True only if it's been added to the list
create or replace function f_condition(myid integer) returns boolean as $_$
DECLARE
    myret boolean := True;
    newid integer;
BEGIN
    RAISE NOTICE 'Return value here is %', myret;
    select count(*) into newid from myids where id = myid;
    if newid > 0 then
        -- id is there, return False
        myret := False;
    else
        -- add id to queue, return True
        insert into myids (id) values (myid);
    end if;
    return myret;
END;
$_$ language plpgsql;
-- trigger function to rebuild myents.sum for this myrecs.id
create or replace function f_refresh() returns trigger as $_$
DECLARE
    mysum integer;
    myid integer;
BEGIN
    IF (TG_OP = 'DELETE') THEN
        myid := OLD.entid;
    ELSE
        myid := NEW.entid;
    END IF;
    RAISE NOTICE 'Return value here is %', myid;
    -- compute entry sum from records and update it.
    -- PL/pgSQL rules on queries are quirky. Read the docs.
    select sum(cases) into mysum from myrecs where entid = myid;
    update myents set sum = mysum where id = myid;
    -- clean up
    delete from myids where id = myid;
    RETURN NULL;
END;
$_$ language plpgsql;
-- only fire the trigger once per myents.id, using a when clause (condition)
-- evaluation is deferred until the transaction concludes
-- note: when clauses on triggers require PostgreSQL 9.0+; comment out for earlier versions
create constraint trigger t_cache
    after insert or update on myrecs
    initially deferred
    for each row
    when ( f_condition(NEW.entid) )
    execute procedure f_refresh();

-- we need 2 triggers: one for NEW (insert and update) and one for OLD (delete)
create constraint trigger t_cache_del
    after delete on myrecs
    initially deferred
    for each row
    when ( f_condition(OLD.entid) )
    execute procedure f_refresh();
insert into myents (sum) values (0);

begin;
insert into myrecs (entid, cases) values (1, 0), (1, 1), (1, 2);
select * from myents;  -- 0
select * from myids;   -- entid from above
commit;

select * from myents;  -- new sum
select * from myids;   -- empty