User:HYanWong/categorylinks parser

From Wikimedia Commons, the free media repository
Jump to: navigation, search
/* A C program to zip through a wiki categorylinks SQL dump and save the categories
   for each page as plain Unicode text on the output line whose number matches the page_id
 
   An example of the output file format is:
 
 
1,'Commons-en'
2,'Commons_administrators','User_BG-2','User_PH-3','User_en','User_en-4','User_fr','User_fr-N','User_gu','User_gu-1','User_hi','User_hi-2','Wikisource_administrators'
 
 
5,'User_de','User_de-1','User_en','User_en-3','User_fr','User_fr-N','User_ja','User_ja-1'
 
7,'Inactive_Commons_pages'
8,'Commons_policies'
9,'Commons_help','Commons_image_resources','Commons_licensing_help','Commons_policies'
10,'Inactive_Commons_pages'
11,'Inactive_Commons_projects'
 
 
 
15,'User_en','User_en-N'
16,'Commons_administrators','Commons_guidelines'
 
 
19,'User_en','User_en-N','User_fr','User_fr-1'
20,'Commons_OTRS_volunteers','Commons_administrators','User_en','User_en-N'
 
 
23,'Header_templates','Internationalised_header_templates'
24,'Commons_help'
 
26,'CC-BY-SA-3.0-migrated','Callipepla_gambelii','GFDL','License_migration_completed','Self-published_work'
27,'User_ar','User_ar-1','User_en','User_en-N','User_es','User_es-N','User_ja','User_ja-2','User_pt','User_pt-2','User_sc','User_sc-1'
 
 */
 
#include <stdio.h>
#include <stdlib.h>
#include <wchar.h>
 
/*
 * Entry point.
 *
 * Usage: <program> <categorylinks SQL dump> <output file>
 *
 * Skips the dump up to the first line that begins with
 * "INSERT INTO `categorylinks` VALUES", then walks the parenthesised records
 * that follow.  For every record it reads the leading page_id and copies the
 * text up to and including the first quoted field (the category name) to the
 * output.  Newlines are emitted so that the categories of page N end up on
 * output line N; a page_id is printed only when it differs from the previous
 * record's page_id.
 *
 * Progress is reported on stdout: a '.' every 100000 records and a '|' every
 * 1000000 records.
 *
 * Exit status:
 *   0  finished (end of input reached between records)
 *   1  read error, malformed record, truncated input or output write failure
 *   2  bad usage, out of memory, or a file could not be opened
 *   3  the INSERT line was never found
 *
 * NOTE(review): wide-character decoding depends on the current locale and
 * this program never calls setlocale(), so it runs in the "C" locale --
 * confirm that non-ASCII input decodes as intended on the target platform.
 */
int main(int argc, char *argv[])
{
	const wchar_t *start_after = L"INSERT INTO `categorylinks` VALUES";
	const int cutoff = 2;      /* stop copying once this many unescaped quotes were seen in a record */
	FILE *in = NULL;
	FILE *out = NULL;
	wchar_t *buffer = NULL;
	size_t prefix_len;
	unsigned long long i = 0;  /* number of records processed, for the progress display */
	long long page_id;
	long long prev_id = 1;
	long long n;
	wint_t c;
	wint_t skipped;
	int quotes_in_line = 0;    /* unescaped quotes seen in the current record */
	int nesting = 0;           /* 1 while inside a quoted string, 0 otherwise */
	int escapes = 0;           /* length of the current run of backslashes */
	int status = 1;

	if (argc < 3) {
		fprintf(stderr, "Usage: %s <categorylinks.sql> <output file>\n",
		        (argc > 0 && argv[0] != NULL) ? argv[0] : "categorylinks_parser");
		return 2;
	}

	/* Room for the whole prefix plus the terminating null written by fgetws. */
	prefix_len = wcslen(start_after);
	buffer = malloc((prefix_len + 1) * sizeof *buffer);
	if (buffer == NULL) {
		fprintf(stderr, "Out of memory\n");
		return 2;
	}

	printf("Reading wiki SQL dump from %s\n", argv[1]);
	in = fopen(argv[1], "r");
	if (in == NULL) {
		perror(argv[1]);
		status = 2;
		goto cleanup;
	}
	printf("Writing to %s\n", argv[2]);
	out = fopen(argv[2], "w");
	if (out == NULL) {
		perror(argv[2]);
		status = 2;
		goto cleanup;
	}

	printf("Looking for a line starting: \"%ls\" ... ", start_after);
	/*
	 * Discard the file up to a line beginning with the start_after string.
	 * fgetws stops at each newline, so every new line starts a fresh chunk
	 * of at most prefix_len characters.
	 */
	for (;;) {
		if (fgetws(buffer, (int)(prefix_len + 1), in) == NULL) {
			/* End of file or read error before the INSERT line was seen. */
			printf(" Oops - error!\n");
			status = 3;
			goto cleanup;
		}
		if (wcscmp(buffer, start_after) == 0) {
			break;
		}
	}
	/* Pretend a record has just ended so the main loop zooms to the first '('. */
	ungetwc(L')', in);

	printf("Found.\nStarting to read data: outputting a | every million page_ids read\n");
	fputwc(L'1', out); /* hack - the first record doesn't print a number */

	/* Read character by character; as in the original, a NUL character ends the loop. */
	while ((c = fgetwc(in)) != L'\0') {
		if (c == WEOF) {
			printf(" Oops. Come to the end of the file but haven't finished a record. Is the file truncated?\n");
			status = 1;
			goto cleanup;
		}

		switch (c) {
		case L'\\': /* backslash used to escape, e.g. quote marks */
			escapes++;
			if (quotes_in_line < cutoff) {
				fputwc((wchar_t)c, out);
			}
			break;

		case L'\'': /* only count quotes that aren't escaped */
			if (quotes_in_line < cutoff) {
				fputwc((wchar_t)c, out);
			}
			if ((escapes % 2) == 0) {
				nesting = 1 - nesting;
				quotes_in_line++;
			}
			escapes = 0;
			break;

		case L')': /* end of a record, unless the ')' is inside a quoted string */
			if (nesting == 0) {
				quotes_in_line = 0;

				/* Zoom to the next record; running out of input here is the normal exit. */
				while ((skipped = fgetwc(in)) != L'(') {
					if (skipped == WEOF) {
						if (feof(in)) {
							printf(" Done!\n");
							status = 0;
						} else {
							printf(" Oops- error!\n");
							status = 1;
						}
						goto cleanup;
					}
				}

				/*
				 * The streams are wide-oriented by now, so the wide
				 * formatted I/O functions must be used on them.
				 */
				if (fwscanf(in, L"%lld", &page_id) != 1) {
					printf("Something wrong near line %lld\n", prev_id);
					status = 1;
					goto cleanup;
				}

				/* Only print the page_id on the first record of each page. */
				if (page_id != prev_id) {
					if (page_id > prev_id) {
						/* One newline per skipped id keeps page N on line N. */
						for (n = page_id - prev_id; n > 0; n--) {
							fputwc(L'\n', out);
						}
						fwprintf(out, L"%lld", page_id);
					} else {
						printf("Page_ids not in order for page_ids %lld and %lld\n", prev_id, page_id);
					}
					prev_id = page_id;
				}

				if ((++i % 100000) == 0) {
					if ((i % 1000000) != 0) {
						printf(".");
					} else {
						printf("|");
					}
					fflush(stdout);
				}
			}
			escapes = 0;
			break;

		default:
			if (quotes_in_line < cutoff) {
				fputwc((wchar_t)c, out);
			}
			escapes = 0;
			break;
		}
	}
	status = 0;

cleanup:
	if (in != NULL) {
		fclose(in);
	}
	/* fclose flushes buffered output, so a failure here means data was lost. */
	if (out != NULL && fclose(out) != 0) {
		perror(argv[2]);
		if (status == 0) {
			status = 1;
		}
	}
	free(buffer);
	return status;
}