bufr.pm:bufrextract.pl_source

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revision Previous revision
bufr.pm:bufrextract.pl_source [2025-11-05 09:28:24]
pals
bufr.pm:bufrextract.pl_source [2026-03-26 17:56:54] (current)
pals
Line 2: Line 2:
 #!/usr/bin/perl #!/usr/bin/perl
  
-# (C) Copyright 2010-2025 MET Norway+Copyright (C) 2010-2026 MET Norway
 # #
 # This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
Line 30: Line 30:
 my %option = (); my %option = ();
 GetOptions( GetOptions(
-           \%option, +    \%option, 
-           'ahl=s',        # Extract BUFR messages with AHL matching <ahl_regexp> only +    'ahl=s',           # Extract BUFR messages with AHL matching <ahl_regexp> only 
-           'gts',          # Include full gts message envelope if present +    'gts',             # Include full gts message envelope if present 
-           'help',         # Print help information and exit +    'filter=s',        # Extract BUFR messages meeting the <metadata criteria> only 
-           'only_ahl',     # Extract AHLs only +    'help',            # Print help information and exit 
-           'outfile=s',    # Print to file instead of STDOUT +    'only_ahl',        # Extract AHLs only 
-           'verbose=i',    # Set verbose level to n, 0<=n<=6 (default 0) +    'outfile=s',       # Print to file instead of STDOUT 
-           'without_ahl',  # Print the BUFR messages only, skipping AHLs +    'verbose=i',       # Set verbose level to n, 0<=n<=6 (default 0) 
-       ) or pod2usage(-verbose => 0);+    'without_ahl',     # Print the BUFR messages only, skipping AHLs 
 +    ) or pod2usage(-verbose => 0);
  
 # User asked for help # User asked for help
Line 57: Line 58:
 Geo::BUFR->set_verbose($option{verbose}) if $option{verbose}; Geo::BUFR->set_verbose($option{verbose}) if $option{verbose};
  
 +# For filtering on ahl
 my $ahl_regexp; my $ahl_regexp;
 if ($option{ahl}) { if ($option{ahl}) {
Line 62: Line 64:
     die "Argument to --ahl is not a valid Perl regular expression: $@" if $@;     die "Argument to --ahl is not a valid Perl regular expression: $@" if $@;
 } }
 +
 +# For filtering on metadata in section 0/1
 +my $filter = $option{filter} ? $option{filter} : '';
 +my $or_criteria_ref = get_filter_criteria($filter);
  
 # Where to direct output (including verbose output, but not output to STDERR) # Where to direct output (including verbose output, but not output to STDERR)
Line 79: Line 85:
 foreach my $inputfname ( @ARGV ) { foreach my $inputfname ( @ARGV ) {
     my $bufr = Geo::BUFR->new();     my $bufr = Geo::BUFR->new();
-    $bufr->set_filter_cb(\&filter_on_ahl,$ahl_regexp) if $option{ahl};+ 
 +    # Could alternatively have merged filtering on ahl and metadata into 
 +    # one single callback function, but that would be a rather complex 
 +    # one, so we prefer to do the filtering on metadata later 
 +    $bufr->set_filter_cb(\&filter_on_ahl, $ahl_regexp) if $option{ahl};
  
     # Open BUFR file     # Open BUFR file
Line 123: Line 133:
         }         }
  
-        next if $option{ahl} && $bufr->is_filtered();+ # Filtering on ahl 
 +        next READLOOP if $option{ahl} && $bufr->is_filtered(); 
 + 
 + # Filtering on metadata 
 +        next READLOOP if $or_criteria_ref && not or_filter($bufr, $or_criteria_ref); 
         # Skip messages where stated length of BUFR message is sure to         # Skip messages where stated length of BUFR message is sure to
         # be erroneous, unless we want ahls only (or should we skip         # be erroneous, unless we want ahls only (or should we skip
         # message in this case also? Hard choice...)         # message in this case also? Hard choice...)
-        next if !$option{only_ahl} && $bufr->bad_bufrlength();+        next READLOOP if !$option{only_ahl} && $bufr->bad_bufrlength();
  
         my $current_subset_number = $bufr->get_current_subset_number();         my $current_subset_number = $bufr->get_current_subset_number();
Line 156: Line 171:
         my $msg = $bufr->get_bufr_message();         my $msg = $bufr->get_bufr_message();
         print $OUT $msg, $gts_eom;         print $OUT $msg, $gts_eom;
-    }+  }
 } }
  
Line 167: Line 182:
     return $ahl =~ $ahl_regexp ? 0 : 1;     return $ahl =~ $ahl_regexp ? 0 : 1;
 } }
 +
 +# Get the list of alternative metadata criteria (these are separated
 +# by '|', see pod)
 +sub get_filter_criteria {
 +    my $filter = shift;
 +    return ('') if ! $filter;
 +
 +    my @or_criteria;
 +    my @criteria = split /[|]/, $filter;
 +    foreach my $cr (@criteria) {
 +        $cr =~ s/^\s+//;
 +        $cr =~ s/\s+$//;
 +        if ($cr ne '') {
 +            push @or_criteria, $cr;
 +        }
 +    }
 +    return \@or_criteria;
 +}
 +
 +# Return true (1) if the BUFR message is matching all @and_criteria
 +# (to be extracted) for at least one of the @or_criteria
 +sub or_filter {
 +    my ($bufr, $or_criteria_ref) = @_;
 +
 +    my $be = $bufr->get_bufr_edition() || return 0;
 +    my $dc = $bufr->get_data_category();
 +    # Choose to equate data_subcategory with int_data_subcategory, but
 +    # not quite sure about this
 +    my $ic = ($be == 4) ? $bufr->get_int_data_subcategory()
 +                        : $bufr->get_data_subcategory();
 +    my $lc = $bufr->get_loc_data_subcategory();
 +    my $oc = $bufr->get_centre();
 +    my $os = $bufr->get_subcentre();
 +    my $mt = $bufr->get_master_table_version();
 +    my $lt = $bufr->get_local_table_version();
 +    # This will not work for edition 3 when year is before 2000,
 +    # but hard to find a better way...
 +    my $ye = ($be == 4) ? $bufr->get_year()
 +                        : $bufr->get_year_of_century + 2000;
 +    my $mo = $bufr->get_month();
 +    my $da = $bufr->get_day();
 +    my $ho = $bufr->get_hour();
 +    my $mi = $bufr->get_minute();
 +    my $se = ($be == 4) ? $bufr->get_second() : 0;
 +
 +    my $include = 0;
 +  OR:
 +    foreach my $or_criterium (@$or_criteria_ref) {
 +        my $all_ok = 1;
 +        my @and_criteria = split /\s+/, $or_criterium;
 +      AND:
 +        foreach my $and_criterium (@and_criteria) {
 +            my ($c, $list) = split /=/, $and_criterium;
 +            my @list = split /,/, $list;
 +            if ($c eq 'be') {
 +                if (not grep { $_ eq $be } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            } elsif ($c eq 'dc') {
 +                if (not grep { $_ eq $dc } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            } elsif ($c eq 'ic') {
 +                if (not grep { $_ eq $ic } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            } elsif ($c eq 'lc') {
 +                # Not in BUFR edition 3
 +                if (!(defined $lc) || not grep { $_ eq $lc } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            } elsif ($c eq 'oc') {
 +                if (not grep { $_ eq $oc } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            } elsif ($c eq 'os') {
 +                if (not grep { $_ eq $os } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            } elsif ($c eq 'mt') {
 +                if (not grep { $_ eq $mt } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            } elsif ($c eq 'lt') {
 +                if (not grep { $_ eq $lt } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            } elsif ($c eq 'ye') {
 +                if (not grep { $_ eq $ye } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            } elsif ($c eq 'mo') {
 +                if (not grep { $_ eq $mo } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            } elsif ($c eq 'da') {
 +                if (not grep { $_ eq $da } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            } elsif ($c eq 'ho') {
 +                if (not grep { $_ eq $ho } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            } elsif ($c eq 'mi') {
 +                if (not grep { $_ eq $mi } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            } elsif ($c eq 'se') {
 +                if (not grep { $_ eq $se } @list) {
 +                    $all_ok = 0;
 +                    last AND;
 +                }
 +            }  else {
 +                die "Metadata '$c' not recognized, check `perldoc bufrextract.pl`"
 +                    . " for the full list of 2-letter abbreviations accepted!";
 +            }
 +      } # end AND
 +        if ($all_ok == 1) {
 +            # BUFR message has met all conditions in this
 +            # or-criterium, so no need to check the others
 +            $include = 1;
 +            last OR;
 +      }
 +
 +  } # end OR
 +
 +    return $include;
 +}
 +
  
  
Line 177: Line 334:
   bufrextract.pl <bufr file(s)>   bufrextract.pl <bufr file(s)>
       [--ahl <ahl_regexp>]       [--ahl <ahl_regexp>]
-      [--only_ahl[--without_ahl[--gts]+      [--only_ahl | --without_ahl | --gts
 +      [--filter <metadata criteria>]
       [--outfile <filename>]       [--outfile <filename>]
       [--help]       [--help]
Line 185: Line 343:
  
 Extract all BUFR messages and/or corresponding AHLs from BUFR file(s), Extract all BUFR messages and/or corresponding AHLs from BUFR file(s),
-possibly filtering on AHL.+possibly filtering on AHL and/or metadata in section 1.
  
 The AHL (Abbreviated Header Line) is recognized as the TTAAii CCCC The AHL (Abbreviated Header Line) is recognized as the TTAAii CCCC
Line 202: Line 360:
    --only_ahl         Extract AHLs only    --only_ahl         Extract AHLs only
    --without_ahl      Extract BUFR messages only    --without_ahl      Extract BUFR messages only
 +   --filter <metadata criteria>
 +                      Extract BUFR messages matching the <metadata criteria> only
    --outfile <filename>    --outfile <filename>
                       Will print to <filename> instead of STDOUT                       Will print to <filename> instead of STDOUT
Line 211: Line 371:
  
 For option C<--ahl> the <ahl_regexp> should be a Perl regular For option C<--ahl> the <ahl_regexp> should be a Perl regular
-expression. E.g. C<--ahl 'ISS... ENMI'> will decode only BUFR SHIP+expression. E.g. C<--ahl 'ISS... ENMI'> will decode only SHIP BUFR
 (ISS) from CCCC=ENMI. (ISS) from CCCC=ENMI.
  
Line 219: Line 379:
 the GTS: Attachment II-4. Format of Meteorological Messages. the GTS: Attachment II-4. Format of Meteorological Messages.
  
-No bufrtables are needed for running bufrextract.pl, since section 4 +Using C<--filter> makes it possible to filter based on almost any of 
-in BUFR message will not be decoded (which also speeds up execution +the metadata present in section 1 (and 0) of the BUFR messages. Some few 
-quite a bit).+examples which hopefully are enough to illustrate how to write the 
 +<metadata criteria>: according to Common Code Table C-13 of 
 +WMO-no. 306, "dc=0 ic=0,1,2,6" should take care of synoptic and 
 +one-hour observations from fixed-land stations, while "dc=1 ic=0,6" 
 +should do the same for marine stations. If you want to extract both, 
 +use for <metadata criteria>: "dc=0 ic=0,1,2,6 | dc=1 ic=0,6"
 + 
 +Here is the full list of metadata available for filtering (the first 
 +2-letter abbreviation is what should be used in the <metadata criteria>): 
 + 
 +  be = BUFR edition 
 +  oc = Originating centre 
 +  os = Originating subcentre 
 +  dc = Data category (table A) 
 +  ic = International data subcategory 
 +  lc = Local data subcategory 
 +  mt = Master table version number 
 +  lt = Local table version number 
 +  ye = Year 
 +  mo = Month 
 +  da = Day 
 +  ho = Hour 
 +  mi = Minute 
 +  se = Second 
 + 
 +Note that no bufrtables are needed for running bufrextract.pl, since 
 +section 4 in BUFR message will not be decoded (which also speeds up 
 +execution quite a bit).
  
 =head1 HINTS =head1 HINTS
Line 229: Line 416:
 section 0-3, by making your own copy of bufrextract.pl and then section 0-3, by making your own copy of bufrextract.pl and then
 employing one of the many C<get_> subroutines in BUFR.pm. For example, employing one of the many C<get_> subroutines in BUFR.pm. For example,
-to extract only BUFR messages with data category 1, add the following+to extract only BUFR messages with TM315009, add the following
 line just before calling C<is_filtered()> in code: line just before calling C<is_filtered()> in code:
  
-  next if $bufr->get_data_category() != 1; +  next if $bufr->get_descriptors_unexpanded() ne '315009';
- +
-Or to extract BUFR messages with TM315009 only: +
- +
-  next if bufr->get_descriptors_unexpanded() ne '315009';+
  
 =head1 CAVEAT =head1 CAVEAT
Line 251: Line 434:
 =head1 COPYRIGHT =head1 COPYRIGHT
  
-Copyright (C) 2010-2025 MET Norway+Copyright (C) 2010-2026 MET Norway
  
 =cut =cut
 </code> </code>
  • bufr.pm/bufrextract.pl_source.txt
  • Last modified: 2026-03-26 17:56:54
  • by pals